
    )j                     d   d dl mZ d dlmZmZmZmZ d dlmZ	 d dl
mZ d dlmZ ddlmZ ddlmZmZmZ ddlmZ e G d	 d
e                      Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  ZdS )    )	dataclass)AnyDictOptionalUnionN)shard_linear   )swiglu)BaseModelArgscreate_attention_maskscaled_dot_product_attention)initialize_ropec                       e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   d	Zeed
<   dZeed<   dZ	e
ed<   dZeeeeeef         f                  ed<   dZe
ed<   dS )	ModelArgs
model_typehidden_sizenum_hidden_layersintermediate_sizenum_attention_headsrms_norm_eps
vocab_sizenum_key_value_headsi   max_position_embeddingsi@B 
rope_thetaFrope_traditionalNrope_scalingTtie_word_embeddings)__name__
__module____qualname__str__annotations__intfloatr   r   r   boolr   r   r   r   r        ]/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/qwen2.pyr   r      s         OOOOOO#(S(((J"d""";?L(4U5#:%6 678??? $$$$$$r'   r   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
	Attentionargsc                    t                                                       |j        }|j        x| _        }|j        J |j        x| _        }|j        |z  }|dz  | _        t          j	        |||z  d          | _
        t          j	        |||z  d          | _        t          j	        |||z  d          | _        t          j	        ||z  |d          | _        t          ||j        |j        |j        |j                  | _        d S )Ng      TbiasF)basetraditionalscaling_configr   )super__init__r   r   n_headsr   
n_kv_headsscalennLinearq_projk_projv_projo_projr   r   r   r   r   rope)selfr+   dimr4   r5   head_dim	__class__s         r(   r3   zAttention.__init__!   s   !%!99w'333'+'??*#w.t^
iWx%7dCCCiZ(%:FFFiZ(%:FFFi( 2CeDDD#-,$($@
 
 
			r'   Nxmaskcachereturnc                 |   |j         \  }}}|                     |          |                     |          |                     |          }	}}|                    ||| j        d                              dddd          }|                    ||| j        d                              dddd          }|	                    ||| j        d                              dddd          }	|R|                     ||j	                  }|                     ||j	                  }|
                    ||	          \  }}	n*|                     |          }|                     |          }t          |||	|| j        |          }
|
                    dddd                              ||d          }
|                     |
          S )Nr      r	      )offset)rD   r6   rC   )shaper9   r:   r;   reshaper4   	transposer5   r=   rJ   update_and_fetchr   r6   r<   )r>   rB   rC   rD   BLDquerieskeysvaluesoutputs              r(   __call__zAttention.__call__9   s    '1a $AAAv //!Qb99CCAq!QOO||Aq$/266@@Aq!LL1dor::DDQ1aPPiii==G99T%,977D 11$??LD&&ii((G99T??D-T6djt
 
 
 !!!Q1--55aB??{{6"""r'   NNr   r   r    r   r3   mxarrayr   r   rV   __classcell__rA   s   @r(   r*   r*       s        
Y 
 
 
 
 
 
6 $(#	# #8# rx # }	#
 
# # # # # # # #r'   r*   c                   4     e Zd Z fdZdej        fdZ xZS )MLPc                     t                                                       t          j        ||d          | _        t          j        ||d          | _        t          j        ||d          | _        d S NFr-   )r2   r3   r7   r8   	gate_proj	down_projup_proj)r>   r?   
hidden_dimrA   s      r(   r3   zMLP.__init__X   se    3
???:s???yju===r'   rE   c                     |                      t          |                     |          |                     |                              S N)rb   r
   ra   rc   )r>   rB   s     r(   rV   zMLP.__call__^   s4    ~~fT^^A%6%6QHHIIIr'   )r   r   r    r3   rY   rZ   rV   r[   r\   s   @r(   r^   r^   W   s^        > > > > >JRX J J J J J J J Jr'   r^   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
TransformerBlockr+   c                    t                                                       |j        | _        |j        | _        t	          |          | _        t          |j        |j                  | _        t          j
        |j        |j                  | _        t          j
        |j        |j                  | _        || _        d S )Neps)r2   r3   r   r   r*   	self_attnr^   r   mlpr7   RMSNormr   input_layernormpost_attention_layernormr+   r>   r+   rA   s     r(   r3   zTransformerBlock.__init__c   s    #'#; +"4t')?@@!z$*:@QRRR(*
$"3)
 )
 )
% 			r'   NrB   rC   rD   rE   c                     |                      |                     |          ||          }||z   }|                     |                     |                    }||z   }|S rf   )rl   ro   rm   rp   )r>   rB   rC   rD   rhouts          r(   rV   zTransformerBlock.__call__o   s]     NN4//22D%@@EHHT2215566!e
r'   rW   rX   r\   s   @r(   rh   rh   b   s        
Y 
 
 
 
 
 
 $(#	
 
8
 rx 
 }	

 

 
 
 
 
 
 
 
r'   rh   c                   Z     e Zd Zdef fdZ	 	 ddej        deej                 fdZ xZ	S )
Qwen2Modelr+   c                 ~   t                                                       | _        j        | _        j        | _        | j        dk    sJ t          j        j        j                  | _        fdt          j                  D             | _
        t          j        j        j                  | _        d S )Nr   c                 0    g | ]}t                     S ))r+   )rh   ).0_r+   s     r(   
<listcomp>z'Qwen2Model.__init__.<locals>.<listcomp>   s2     
 
 
,-$'''
 
 
r'   rj   )r2   r3   r+   r   r   r7   	Embeddingr   embed_tokensrangelayersrn   r   normrq   s    `r(   r3   zQwen2Model.__init__}   s    	/!%!7""""L$:JKK
 
 
 
16t7M1N1N
 
 
 Jt/T5FGGG			r'   Ninputsinput_embeddingsc                    ||}n|                      |          }|d gt          | j                  z  }t          ||d                   }t	          | j        |          D ]\  }} ||||          }|                     |          S )Nr   )r~   lenr   r   zipr   )r>   r   rD   r   rt   rC   layercs           r(   rV   zQwen2Model.__call__   s     ' AA!!&))A=FS---E$Qa11DK// 	" 	"HE1aq!!AAyy||r'   rW   )
r   r   r    r   r3   rY   rZ   r   rV   r[   r\   s   @r(   rw   rw   |   s        
HY 
H 
H 
H 
H 
H 
H /3	  #28,	       r'   rw   c                        e Zd Zdef fdZ	 	 ddej        deej                 fdZd Z	ddeej
        j                 fd	Zed
             Z xZS )Modelr+   c                     t                                                       || _        |j        | _        t	          |          | _        |j        s(t          j        |j	        |j
        d          | _        d S d S r`   )r2   r3   r+   r   rw   modelr   r7   r8   r   r   lm_headrq   s     r(   r3   zModel.__init__   sq    	/%%
' 	T9T%5tUSSSDLLL	T 	Tr'   Nr   r   c                     |                      |||          }| j        j        r | j         j                            |          }n|                     |          }|S rf   )r   r+   r   r~   	as_linearr   )r>   r   rD   r   ru   s        r(   rV   zModel.__call__   sW     jj(8999( 	$*)33C88CC,,s##C
r'   c                     | j         j        r|                    dd            d |                                D             S )Nzlm_head.weightc                 "    i | ]\  }}d |v	||S )zself_attn.rotary_emb.inv_freqr&   )rz   kvs      r(   
<dictcomp>z"Model.sanitize.<locals>.<dictcomp>   s1     
 
 
Q0OWX0X0XAq0X0X0Xr'   )r+   r   popitems)r>   weightss     r(   sanitizezModel.sanitize   sM    9( 	0KK($///
 
$]]__
 
 
 	
r'   groupc                    |pt           j                                        }|                                }| j        j        D ]7}t          |j        j        d|          |j        _        t          |j        j	        d|          |j        _	        t          |j        j
        d|          |j        _
        t          |j        j        d|          |j        _        |j        xj        |z  c_        |j        xj        |z  c_        t          |j        j        d|          |j        _        t          |j        j        d|          |j        _        t          |j        j        d|          |j        _        9d S )Nzall-to-sharded)r   zsharded-to-all)rY   distributedinitsizer   r   r   rl   r9   r:   r;   r<   r4   r5   rm   ra   rb   rc   )r>   r   Nr   s       r(   shardzModel.shard   s   .,,..JJLLZ& 	 	E%1&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" O##)##O&&1,&& #/	#%5U# # #EI #/	#%5U# # #EI !-	!#35! ! !EI1	 	r'   c                     | j         j        S rf   )r   r   )r>   s    r(   r   zModel.layers   s    z  r'   rW   rf   )r   r   r    r   r3   rY   rZ   r   rV   r   r   Groupr   propertyr   r[   r\   s   @r(   r   r      s        TY T T T T T T /3	  #28,	   
 
 
 8BN$89    > ! ! X! ! ! ! !r'   r   )dataclassesr   typingr   r   r   r   mlx.corecorerY   mlx.nnr7   mlx.nn.layers.distributedr   activationsr
   r/   r   r   r   
rope_utilsr   r   Moduler*   r^   rh   rw   r   r&   r'   r(   <module>r      s   " ! ! ! ! ! - - - - - - - - - - - -             2 2 2 2 2 2       T T T T T T T T T T ' ' ' ' ' ' % % % % % % % % 4# 4# 4# 4# 4#	 4# 4# 4#nJ J J J J") J J J    ry   4       D?! ?! ?! ?! ?!BI ?! ?! ?! ?! ?!r'   