
    )j                     d   d dl mZ d dlmZmZmZmZ d dlmZ	 d dl
mZ d dlmZ ddlmZ ddlmZmZmZ ddlmZ e G d	 d
e                      Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  ZdS )    )	dataclass)AnyDictOptionalUnionN)shard_linear   )swiglu)BaseModelArgscreate_attention_maskscaled_dot_product_attention)initialize_ropec                       e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   eed
<   eed<   eed<   dZe	e
eeeef         f                  ed<   dS )	ModelArgs
model_typehidden_sizenum_hidden_layersintermediate_sizenum_attention_headsrms_norm_eps
vocab_sizenum_key_value_headsmax_position_embeddings
rope_thetahead_dimtie_word_embeddingsNrope_scaling)__name__
__module____qualname__str__annotations__intfloatboolr   r   r   r        ]/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/qwen3.pyr   r      s         OOOOOO    MMM;?L(4U5#:%6 678?????r'   r   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
	Attentionargsc                    t                                                       |j        }|j        x| _        }|j        J |j        x| _        }|j        }|dz  | _        t          j
        |||z  d          | _        t          j
        |||z  d          | _        t          j
        |||z  d          | _        t          j
        ||z  |d          | _        t          j        ||j                  | _        t          j        ||j                  | _        t'          ||j        d|j        |j                  | _        d S )Ng      Fbiaseps)basetraditionalscaling_configr   )super__init__r   r   n_headsr   
n_kv_headsr   scalennLinearq_projk_projv_projo_projRMSNormr   q_normk_normr   r   r   r   rope)selfr+   dimr6   r7   r   	__class__s         r(   r5   zAttention.__init__!   s7   !%!99w'333'+'??*=t^
iWx%7eDDDiZ(%:GGGiZ(%:GGGi( 2CeDDDjt/@AAAjt/@AAA#,$($@
 
 
			r'   Nxmaskcachereturnc                    |j         \  }}}|                     |          |                     |          |                     |          }	}}|                     |                    ||| j        d                                        dddd          }|                     |                    ||| j	        d                                        dddd          }|	                    ||| j	        d                              dddd          }	|R| 
                    ||j                  }| 
                    ||j                  }|                    ||	          \  }}	n*| 
                    |          }| 
                    |          }t          |||	|| j        |          }
|
                    dddd                              ||d          }
|                     |
          S )Nr      r	      )offset)rH   r8   rG   )shaper;   r<   r=   r@   reshaper6   	transposerA   r7   rB   rN   update_and_fetchr   r8   r>   )rC   rF   rG   rH   BLDquerieskeysvaluesoutputs              r(   __call__zAttention.__call__;   s    '1a $AAAv++gooaDL"EEFFPPq!Q
 
 {{4<<1dorBBCCMMq!Q
 
 1dor::DDQ1aPPiii==G99T%,977D 11$??LD&&ii((G99T??D-T6djt
 
 
 !!!Q1--55aB??{{6"""r'   NNr   r   r    r   r5   mxarrayr   r   rZ   __classcell__rE   s   @r(   r*   r*       s        
Y 
 
 
 
 
 
: $(#	# #8# rx # }	#
 
# # # # # # # #r'   r*   c                   4     e Zd Z fdZdej        fdZ xZS )MLPc                     t                                                       t          j        ||d          | _        t          j        ||d          | _        t          j        ||d          | _        d S NFr-   )r4   r5   r9   r:   	gate_proj	down_projup_proj)rC   rD   
hidden_dimrE   s      r(   r5   zMLP.__init__]   se    3
???:s???yju===r'   rI   c                     |                      t          |                     |          |                     |                              S N)rf   r
   re   rg   )rC   rF   s     r(   rZ   zMLP.__call__c   s4    ~~fT^^A%6%6QHHIIIr'   )r   r   r    r5   r]   r^   rZ   r_   r`   s   @r(   rb   rb   \   s^        > > > > >JRX J J J J J J J Jr'   rb   c            	       x     e Zd Zdef fdZ	 	 d	dej        deej                 dee         dej        fdZ	 xZ
S )
TransformerBlockr+   c                    t                                                       |j        | _        |j        | _        t	          |          | _        t          |j        |j                  | _        t          j
        |j        |j                  | _        t          j
        |j        |j                  | _        || _        d S )Nr/   )r4   r5   r   r   r*   	self_attnrb   r   mlpr9   r?   r   input_layernormpost_attention_layernormr+   rC   r+   rE   s     r(   r5   zTransformerBlock.__init__h   s    #'#; +"4t')?@@!z$*:@QRRR(*
$"3)
 )
 )
% 			r'   NrF   rG   rH   rI   c                     |                      |                     |          ||          }||z   }|                     |                     |                    }||z   }|S rj   )rn   rp   ro   rq   )rC   rF   rG   rH   rhouts          r(   rZ   zTransformerBlock.__call__t   s]     NN4//22D%@@EHHT2215566!e
r'   r[   r\   r`   s   @r(   rl   rl   g   s        
Y 
 
 
 
 
 
 $(#	
 
8
 rx 
 }	

 

 
 
 
 
 
 
 
r'   rl   c                   Z     e Zd Zdef fdZ	 	 ddej        deej                 fdZ xZ	S )
Qwen3Modelr+   c                 ~   t                                                       | _        j        | _        j        | _        | j        dk    sJ t          j        j        j                  | _        fdt          j                  D             | _
        t          j        j        j                  | _        d S )Nr   c                 0    g | ]}t                     S ))r+   )rl   ).0_r+   s     r(   
<listcomp>z'Qwen3Model.__init__.<locals>.<listcomp>   s2     
 
 
,-$'''
 
 
r'   r/   )r4   r5   r+   r   r   r9   	Embeddingr   embed_tokensrangelayersr?   r   normrr   s    `r(   r5   zQwen3Model.__init__   s    	/!%!7""""L$:JKK
 
 
 
16t7M1N1N
 
 
 Jt/T5FGGG			r'   Ninputsinput_embeddingsc                    ||}n|                      |          }|d gt          | j                  z  }t          ||d                   }t	          | j        |          D ]\  }} ||||          }|                     |          S )Nr   )r   lenr   r   zipr   )rC   r   rH   r   ru   rG   layercs           r(   rZ   zQwen3Model.__call__   s     ' AA!!&))A=FS---E$Qa11DK// 	" 	"HE1aq!!AAyy||r'   r[   )
r   r   r    r   r5   r]   r^   r   rZ   r_   r`   s   @r(   rx   rx      s        
HY 
H 
H 
H 
H 
H 
H /3	  #28,	       r'   rx   c                        e Zd Zdef fdZ	 	 ddej        deej                 fdZd Z	ddeej
        j                 fd	Zed
             Z xZS )Modelr+   c                     t                                                       || _        |j        | _        t	          |          | _        |j        s(t          j        |j	        |j
        d          | _        d S d S rd   )r4   r5   r+   r   rx   modelr   r9   r:   r   r   lm_headrr   s     r(   r5   zModel.__init__   sq    	/%%
' 	T9T%5tUSSSDLLL	T 	Tr'   Nr   r   c                     |                      |||          }| j        j        r | j         j                            |          }n|                     |          }|S rj   )r   r+   r   r   	as_linearr   )rC   r   rH   r   rv   s        r(   rZ   zModel.__call__   sW     jj(8999( 	$*)33C88CC,,s##C
r'   c                 J    | j         j        r|                    dd            |S )Nzlm_head.weight)r+   r   pop)rC   weightss     r(   sanitizezModel.sanitize   s)    9( 	0KK($///r'   groupc                    |pt           j                                        }|                                }| j        j        D ]7}t          |j        j        d|          |j        _        t          |j        j	        d|          |j        _	        t          |j        j
        d|          |j        _
        t          |j        j        d|          |j        _        |j        xj        |z  c_        |j        xj        |z  c_        t          |j        j        d|          |j        _        t          |j        j        d|          |j        _        t          |j        j        d|          |j        _        9d S )Nzall-to-sharded)r   zsharded-to-all)r]   distributedinitsizer   r   r   rn   r;   r<   r=   r>   r6   r7   ro   re   rf   rg   )rC   r   Nr   s       r(   shardzModel.shard   s   .,,..JJLLZ& 	 	E%1&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" &2&(8& & &EO" O##)##O&&1,&& #/	#%5U# # #EI #/	#%5U# # #EI !-	!#35! ! !EI1	 	r'   c                     | j         j        S rj   )r   r   )rC   s    r(   r   zModel.layers   s    z  r'   r[   rj   )r   r   r    r   r5   r]   r^   r   rZ   r   r   Groupr   propertyr   r_   r`   s   @r(   r   r      s        TY T T T T T T /3	  #28,	     
 8BN$89    > ! ! X! ! ! ! !r'   r   )dataclassesr   typingr   r   r   r   mlx.corecorer]   mlx.nnr9   mlx.nn.layers.distributedr   activationsr
   r1   r   r   r   
rope_utilsr   r   Moduler*   rb   rl   rx   r   r&   r'   r(   <module>r      s   " ! ! ! ! ! - - - - - - - - - - - -             2 2 2 2 2 2       T T T T T T T T T T ' ' ' ' ' ' @ @ @ @ @ @ @ @ 9# 9# 9# 9# 9#	 9# 9# 9#xJ J J J J") J J J    ry   4       D<! <! <! <! <!BI <! <! <! <! <!r'   