
    )j                     b   d dl Z d dlmZ d dlmZ d dlmZ ddlm	Z	 ddl
mZmZmZ e G d de                      Z G d d	ej                  Z G d
 dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  ZdS )    N)	dataclass   )swiglu)BaseModelArgscreate_attention_maskscaled_dot_product_attentionc                       e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   eed
<   eed<   dZeed<   dZ	eed<   dS )	ModelArgs
model_type
vocab_sizehidden_sizenum_attention_headsnum_hidden_layersnum_key_value_headsintermediate_size
rope_thetause_qkv_biaspartial_rotary_factorlayer_norm_epsFuse_parallel_residualqk_layernormN)
__name__
__module____qualname__str__annotations__intfloatboolr   r        `/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/stablelm.pyr
   r
      s         OOOOOO    "'4'''L$r!   r
   c                   $     e Zd Z fdZd Z xZS )LayerNormPerHeadc                     t                                                       fdt          |          D             | _        | _        d S )Nc                 >    g | ]}t          j        d           S )F)epsbias)nn	LayerNorm).0_r'   head_dims     r"   
<listcomp>z-LayerNormPerHead.__init__.<locals>.<listcomp>"   s7     
 
 
<=BLs777
 
 
r!   )super__init__rangenormsr'   )selfr-   	num_headsr'   	__class__s    ` `r"   r0   zLayerNormPerHead.__init__    s_    
 
 
 
 
AFyAQAQ
 
 

 r!   c                     t          j        d | j        D                       }|t           j                            |d d | j                  z  S )Nc                     g | ]	}|j         
S r    )weight)r+   ns     r"   r.   z-LayerNormPerHead.__call__.<locals>.<listcomp>(   s    3331ah333r!   )mxstackr2   fast
layer_normr'   )r3   xws      r"   __call__zLayerNormPerHead.__call__'   sE    H33
3334427%%atTX>>>>r!   )r   r   r   r0   r@   __classcell__r5   s   @r"   r$   r$      sG            ? ? ? ? ? ? ?r!   r$   c                   ,     e Zd Zdef fdZddZ xZS )	Attentionconfigc                    t                                                       |j        | _        |j        | _        | j        | j        z  | _        |j        | _        |j        | _        |j        | _        | j        | j        z  | j        k    r t          d| j         d| j         d          t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        | j        z  |j                  | _        t          j        | j        | j        z  | j        d          | _        t          j        t%          | j        | j        z            d| j                  | _        |j        | _        | j        rNt+          | j        | j        |j                  | _        t+          | j        | j        |j                  | _        d S d S )Nz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r(   F)traditionalbaser'   )r/   r0   r   r   r4   r-   r   r   r   
ValueErrorr)   Linearr   q_projk_projv_projo_projRoPEr   roper   r$   r   q_layernormk_layernormr3   rE   r5   s     r"   r0   zAttention.__init__-   s    !-3(DN:#)#=  +%+%A"MDN*t/???8RVRb 8 8%)^8 8 8  
 idnt}<6CV
 
 
 i$t}4$
 
 

 i$t}4$
 
 

 iNT]*D,<5
 
 
 G*T]:;;
 
 
	 #/ 	/t~63H     D  0t7V=R     D		 	r!   Nc                    |                      |          |                     |          |                     |          }}}|j        \  }}}	|                    ||| j        d          }|                    ||| j        d          }| j        r*|                     |          }| 	                    |          }|
                    dddd          }|
                    dddd          }|                    ||| j        d          
                    dddd          }|R|                     ||j                  }|                     ||j                  }|                    ||          \  }}n*|                     |          }|                     |          }|                    t          j                  }|                    t          j                  }t#          j        d|j        d         z            }
t'          |||||
|                              |j                  }|
                    dddd                              ||d          }|                     |          S )Nr      r      )offset)cachescalemask)rM   rN   rO   shapereshaper4   r   r   rS   rT   	transposerR   rZ   update_and_fetchastyper:   float32mathsqrtr   dtyperP   )r3   r>   r]   r[   querieskeysvaluesBLDr\   outputs               r"   r@   zAttention.__call__]   s)    $AAAv -1a//!Q;;||Aq$":B?? 	*&&w//G##D))D##Aq!Q//~~aAq))1d&>CCMMq!Q
 

 iii==G99T%,977D 11$??LD&&ii((G99T??D..,,{{2:&& 	!gmB//00-T6e$
 
 

&

 	 !!!Q1--55aB??{{6"""r!   )NNr   r   r   r
   r0   r@   rA   rB   s   @r"   rD   rD   ,   sY        .y . . . . . .`## ## ## ## ## ## ## ##r!   rD   c                   4     e Zd Z fdZdej        fdZ xZS )MLPc                     t                                                       t          j        ||d          | _        t          j        ||d          | _        t          j        ||d          | _        d S NFrG   )r/   r0   r)   rL   	gate_proj	down_projup_proj)r3   dim
hidden_dimr5   s      r"   r0   zMLP.__init__   se    3
???:s???yju===r!   returnc                     |                      t          |                     |          |                     |                              S N)rt   r   rs   ru   )r3   r>   s     r"   r@   zMLP.__call__   s4    ~~fT^^A%6%6QHHIIIr!   )r   r   r   r0   r:   arrayr@   rA   rB   s   @r"   rp   rp      s^        > > > > >JRX J J J J J J J Jr!   rp   c                   *     e Zd Zdef fdZd Z xZS )DecoderLayerrE   c                 n   t                                                       t          |          | _        t	          |j        |j                  | _        t          j	        |j        |j
                  | _        |j        | _        | j        s't          j	        |j        |j
                  | _        d S d S )N)rE   rJ   )r/   r0   rD   	self_attnrp   r   r   mlpr)   r*   r   input_layernormr   post_attention_layernormrU   s     r"   r0   zDecoderLayer.__init__   s    "&111v)6+CDD!|% 
  
  
 &,%A") 	,.L")- - -D)))	 	r!   c                    |                      |          }|                     |||          }| j        r||z   |                     |          z   }n2||z   }|                     |                     |                    }||z   }|S rz   )r   r   r   r   r   )r3   r>   r]   r[   hrouts          r"   r@   zDecoderLayer.__call__   s      ##NN1dE**% 	a%$((1++%CCAA66q99::Aa%C
r!   rn   rB   s   @r"   r}   r}      sS        y      
 
 
 
 
 
 
r!   r}   c                   *     e Zd Zdef fdZd Z xZS )StableLMrE   c                 &   t                                                       t          j        j        j                  | _        fdt          j                  D             | _	        t          j
        j        j                  | _        d S )Nc                 .    g | ]}t                    S r    )r}   )r+   irE   s     r"   r.   z%StableLM.__init__.<locals>.<listcomp>   s!    UUU|F++UUUr!   rJ   )r/   r0   r)   	Embeddingr   r   embed_tokensr1   r   layersr*   r   normrU   s    `r"   r0   zStableLM.__init__   sz    L):F<NOOUUUUU6;S5T5TUUUL!39NOOO			r!   c                    |                      |          }|d gt          | j                  z  }t          ||d                   }t	          | j        |          D ]\  }} ||||          }|                     |          S )Nr   )r[   )r   lenr   r   zipr   )r3   r>   r[   r]   layercs         r"   r@   zStableLM.__call__   s    a  =FS---E$Qa11DK// 	( 	(HE1aQ'''AAyy||r!   rn   rB   s   @r"   r   r      sZ        Py P P P P P P
 
 
 
 
 
 
r!   r   c                   b     e Zd Zdef fdZ	 ddej        dej        fdZed             Z	 xZ
S )	ModelrE   c                     t                                                       |j        | _        t          |          | _        t          j        |j        |j        d          | _	        || _
        d S rr   )r/   r0   r   r   modelr)   rL   r   r   lm_headargsrU   s     r"   r0   zModel.__init__   s[     +f%%
y!3V5FUSSS			r!   Nr>   rx   c                 X    |                      ||          }|                     |          S rz   )r   r   )r3   r>   r[   ys       r"   r@   zModel.__call__   s'    
 JJq%  ||Ar!   c                     | j         j        S rz   )r   r   )r3   s    r"   r   zModel.layers   s    z  r!   rz   )r   r   r   r
   r0   r:   r{   r@   propertyr   rA   rB   s   @r"   r   r      s        y        8 
	    ! ! X! ! ! ! !r!   r   )rd   dataclassesr   mlx.corecorer:   mlx.nnr)   activationsr   rI   r   r   r   r
   Moduler$   rD   rp   r}   r   r   r    r!   r"   <module>r      s    ! ! ! ! ! !                   T T T T T T T T T T         ? ? ? ? ?ry ? ? ?T# T# T# T# T#	 T# T# T#nJ J J J J") J J J    29   :    ry   (! ! ! ! !BI ! ! ! ! !r!   