
from typing import Callable, List, Optional, Tuple, Union

import mlx.core as mx
from mlx.nn import Module
from mlx.utils import (
    tree_flatten,
    tree_map,
    tree_merge,
    tree_reduce,
    tree_unflatten,
)


class Optimizer:
    """The base class for all optimizers. It allows us to implement an
    optimizer on a per-parameter basis and apply it to a parameter tree.
    """

    def __init__(self, schedulers=None):
        self._initialized = False
        self._state = {"step": mx.array(0, mx.uint64)}
        self._schedulers = {k: v for k, v in (schedulers or {}).items()}

    def update(self, model: Module, gradients: dict):
        """Apply the gradients to the parameters of the model and update the
        model with the new parameters.

        Args:
            model (mlx.nn.Module): An mlx module to be updated.
            gradients (dict): A Python tree of gradients, most likely computed
                              via :func:`mlx.nn.value_and_grad`.
        """
        model.update(self.apply_gradients(gradients, model))

    def init(self, parameters: dict):
        """Initialize the optimizer's state
        This function can be used to initialize optimizers which have state
        (like momentum in :class:`SGD`). Using this method is optional as the
        optimizer will initialize itself if the state is not yet set. However,
        there are some cases where explicit initialization is useful in order
        to have access to the :attr:`Optimizer.state` before the first call to
        :meth:`Optimizer.update`.

        Args:
            model (dict): A Python tree of parameters.

        Example:
            >>> optimizer = optim.SGD(learning_rate=1e-1, momentum=0.9)
            >>> model = nn.Linear(2, 2)
            >>> optimizer.init(model.trainable_parameters())
            >>> optimizer.state.keys()
            dict_keys(['step', 'learning_rate', 'weight', 'bias'])
        c           	      r   t          | t          t          f          rt          |          }t          t	          |                    D ]} | |         ||                   ||<   t	          |          t	          |           k    r9|                    t          d | t	          |          d                                t          |           |          S t          | t                    rJ| 	                                D ]3\  }}||vrt          d |          ||<    |||                   ||<   4|S |S )Nc                     i S Nr   _s    r   <lambda>z6Optimizer.init.<locals>.update_state.<locals>.<lambda>;   s    B r   c                     i S r-   r   r.   s    r   r0   z6Optimizer.init.<locals>.update_state.<locals>.<lambda>@   s    b r   )

isinstancelisttuplerangelenextendr
   typedictr   )paramsstateir   r   update_states        r   r=   z$Optimizer.init.<locals>.update_state5   s.   &4-00 Us5zz** A AA+|F1IuQx@@E!HHu::V,,LL,,s5zz||8L!M!MNNN#tF||E***FD)) "LLNN = =DAq~~#+LL!#<#<a#/<58#<#<ar   c                 4    |p                     | |          S r-   )init_single)psr!   s     r   r0   z Optimizer.init.<locals>.<lambda>H   s    a94#3#3Aq#9#9 r   TN)r   r
   r   )r!   r)   r=   s   ` @r   initzOptimizer.init   s_    ,	 	 	 	 	$ 	Z---9999:t{SSS r   	parameterr;   c                     t                      )zTo be extended by the children classes to implement each optimizer's
        state initialization.

        Args:
            parameter (mx.array): A single parameter that will be optimized.
            state (dict): The optimizer's state.
        NotImplementedErrorr!   rC   r;   s      r   r?   zOptimizer.init_singleK        "###r   c                    | j         s|                     |           | j                                        D ]\  }} || j                  | j        |<   | j        dz   | j        d<   t          | j        ||| j                  S )a  Apply the gradients to the parameters and return the updated parameters.

        Can be used to update a model via
        ``model.update(opt.apply_gradients(grads, model))`` which is precisely
        how :meth:`Optimizer.update` is implemented.

        Args:
            gradients (dict): A Python tree of gradients.
            parameters (dict): A Python tree of parameters. It can be a
              superset of the gradients. In that case the returned python
              tree will be of the same structure as the gradients.
           r   )r   rB   r    r   r   r;   r
   apply_single)r!   r%   r)   param	schedulers        r   r(   zOptimizer.apply_gradientsU   s       	!IIi    !% 0 6 6 8 8 	5 	5E9 )	$) 4 4DJu "Y]
6 )9j$*MMMr   gradientc                     t                      )a  To be extended by derived classes to implement the optimizer's update.

        Args:
            gradient (mx.array): The ``parameter`` gradient.
            parameter (mx.array): The ``parameter`` to update.
            state (dict): The optimizer's state.
        """
        raise NotImplementedError()

    @property
    def state(self):
        """The optimizer's state dictionary."""
        return self._state

    @state.setter
    def state(self, state: dict):
        self._initialized = False
        self._state = state

    @property
    def step(self):
        return self.state["step"]

    @property
    def learning_rate(self):
        return self.state["learning_rate"]

    @learning_rate.setter
    def learning_rate(self, learning_rate: Union[float, mx.array]):
        self.state["learning_rate"] = mx.array(learning_rate)

    def _maybe_schedule(
        self, name: str, param: Union[float, Callable[[mx.array], mx.array]]
    ):
        """
        To be used by derived classes to optionally put a parameter on a schedule.
        """
        if isinstance(param, Callable):
            self._schedulers[name] = param
            param = param(self.step)
        else:
            param = mx.array(param)
        self.state[name] = param


class MultiOptimizer(Optimizer):
    """Wraps a list of optimizers with corresponding weight predicates/filters

    The predicates take the full "path" of the weight and the weight itself and
    return True if it should be considered for this optimizer. The last
    optimizer in the list is a fallback optimizer and no predicate should be
    given for it.

    Args:
        optimizers (list[Optimizer]): A list of optimizers to delegate to
        filters (list[Callable[[str, array], bool]): A list of predicates that
            should be one less than the provided optimizers.
    filtersc                 (   t                                                       i | _        t          |          t          |          dz
  k    r3t	          dt          |           dt          |          dz
   d          || _        |d gz   | _        d S )NrJ   zGiven z filters but z needed.c                      dS )NTr   )argskwargss     r   r0   z)MultiOptimizer.__init__.<locals>.<lambda>   s    $ r   )superr#   r   r6   
ValueError
optimizersrf   )r!   rm   rf   	__class__s      r   r#   zMultiOptimizer.__init__   s    w<<3z??Q...OWOOC
OOA4EOOO   %">">!??r   r%   c                 ^   t          | j                  dk    r|gS d t          t          | j                            D             }t          |          }|D ]J\  }}t	          | j                  D ]0\  }} |||          r||                             ||f            n1Kd |D             S )NrJ   c                     g | ]}g S r   r   )r   r/   s     r   
<listcomp>z4MultiOptimizer._split_dictionary.<locals>.<listcomp>   s    999999r   c                 ,    g | ]}t          |          S r   )r   )r   r@   s     r   rq   z4MultiOptimizer._split_dictionary.<locals>.<listcomp>   s     111aq!!111r   )r6   rm   r5   r	   	enumeraterf   append)r!   r%   partsflat_gradientsr   gr<   fns           r   _split_dictionaryz MultiOptimizer._split_dictionary   s    t1$$;99U3t#7#788999%i00" 	 	DAq"4<00  22a88 !HOOQF+++E 2151111r   r)   c                     t          | j        |                     |                    D ]\  }}|                    |           d S r-   )ziprm   ry   rB   )r!   r)   or@   s       r   rB   zMultiOptimizer.init   sJ    )?)?
)K)KLL 	 	DAqFF1IIII	 	r   c                     i }t          | j        |                     |                    D ])\  }}t          ||                    ||                    }*|S r-   )r{   rm   ry   r   r(   )r!   r%   r)   treer|   rw   s         r   r(   zMultiOptimizer.apply_gradients   s\    )?)?	)J)JKK 	F 	FDAqdA$5$5a$D$DEEDDr   c                 (    dd | j         D             iS )Nstatesc                     g | ]	}|j         
S r   rT   )r   r|   s     r   rq   z(MultiOptimizer.state.<locals>.<listcomp>   s    <<<q17<<<r   )rm   rQ   s    r   r;   zMultiOptimizer.state   s    <<DO<<<==r   r;   c                     d|vs+t          |d                   t          | j                  k    rt          d          t          | j        |d                   D ]\  }}||_        d S )Nr   zInvalid state provided)r6   rm   rl   r{   r;   )r!   r;   r|   rA   s       r   r;   zMultiOptimizer.state   sr    5  Ch$8$8C<P<P$P$P5666x99 	 	DAqAGG	 	r   c                 &    | j         d         j        S )Nr   rm   rW   rQ   s    r   rW   zMultiOptimizer.learning_rate   s    q!//r   rW   c                 (    | j         D ]	}||_        
d S r-   r   )r!   rW   r|   s      r   rW   zMultiOptimizer.learning_rate   s&     	, 	,A+AOO	, 	,r   )r\   r]   r^   r_   r3   r#   r9   ry   rB   r(   r`   r;   ra   rW   r   rb   r   r   __classcell__rn   s   @r   re   re      sM         46 
@ 
@D 
@ 
@ 
@ 
@ 
@ 
@24 2 2 2 2t     4     > > X> \4    \ 0 0 X0 ,5+A , , , , , , , ,r   re   c                        e Zd ZdZ	 	 	 	 ddeeeej        gej        f         f         dededede	f
 fd	Z
d
ej        defdZdej        d
ej        defdZ xZS )SGDa  The stochastic gradient descent optimizer.

    Updates a parameter :math:`w` with a gradient :math:`g` as follows

    .. math::

        v_{t+1} &= \mu v_t + (1 - \tau) g_t \\
        w_{t+1} &= w_t - \lambda v_{t+1}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        momentum (float, optional): The momentum strength :math:`\mu`. Default: ``0``
        weight_decay (float, optional): The weight decay (L2 penalty). Default: ``0``
        dampening (float, optional): Dampening for momentum :math:`\tau`. Default: ``0``
        nesterov (bool, optional): Enables Nesterov momentum. Default: ``False``
            FrW   momentumweight_decay	dampeningnesterovc                     |r|dk    s|dk    rt          d          t                                                       |                     d|           || _        || _        || _        || _        d S )Nr   z9Nesterov momentum requires a momentum and zero dampening.rW   )rl   rk   r#   r[   r   r   r   r   )r!   rW   r   r   r   r   rn   s         r   r#   zSGD.__init__   s      	Q)q..K   	_m<<< (" r   rC   r;   c                 4    t          j        |          |d<   dS Initialize optimizer stater   Nr   
zeros_likerG   s      r   r?   zSGD.init_single      ]9--c


r   rN   c                    | j         dk    r|| j         |z  z  }| j        dk    r%|| j                            |j                  |z  z
  S | j        |                    d          z  }| j        dk    r|d| j        z
  |z  z  }n||z  }| j        r|| j        |z  z   }n|}||d<   || j                            |j                  |z  z
  S )zVPerforms the SGD parameter update and stores :math:`v` in the
        optimizer state.r   r   rJ   )r   r   rW   astypedtypegetr   r   )r!   rN   rC   r;   r   r'   s         r   rK   zSGD.apply_single  s     !!)I55H=At188HH8SSSMEIIcNN*>A!dn$00AAMA= 	 11FFFc
4-44X^DDvMMMr   )r   r   r   F)r\   r]   r^   r_   r   rb   r   r   r   boolr#   r9   r?   rK   r   r   s   @r   r   r      s         ( !! !UHbhZ-A$BBC! ! 	!
 ! ! ! ! ! ! !(.RX .d . . . .NRX N"( N4 N N N N N N N Nr   r   c                        e Zd ZdZ	 	 ddeeeej        gej        f         f         dedef fdZ	dej        d	e
fd
Zdej        dej        d	e
fdZ xZS )RMSpropag  The RMSprop optimizer [1].

    [1]: Tieleman, T. and Hinton, G. 2012. Lecture 6.5-rmsprop, coursera: Neural networks for machine learning

    .. math::

        v_{t+1} &= \alpha v_t + (1 - \alpha) g_t^2 \\
        w_{t+1} &= w_t - \lambda \frac{g_t}{\sqrt{v_{t+1}} + \epsilon}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        alpha (float, optional): The smoothing constant :math:`\alpha`.
          Default: ``0.99``
        eps (float, optional): The term :math:`\epsilon` added to the denominator
          to improve numerical stability. Default: ``1e-8``
    Gz?:0yE>rW   alphaepsc                    t                                                       |                     d|           || _        || _        | j        dk     rt          d| j         d          | j        dk     rt          d| j         d          d S )NrW   r   zRMSprop alpha should be >=0,  was provided insteadzRMSprop epsilon should be >0, )rk   r#   r[   r   r   rl   )r!   rW   r   r   rn   s       r   r#   zRMSprop.__init__;  s     	_m<<<
:Q
QQQ   8c>>PPPP   >r   rC   r;   c                 4    t          j        |          |d<   dS r   r   rG   s      r   r?   zRMSprop.init_singleP  r   r   rN   c                     | j                             |j                  }| j        }| j        }|d         }||z  d|z
  t          j        |          z  z   }||d<   |||z  t          j        |          |z   z  z
  S )zRPerforms the RMSprop parameter update and stores :math:`v` in the optimizer state.r   rJ   )rW   r   r   r   r   r   squaresqrt)r!   rN   rC   r;   lrr   r   r   s           r   rK   zRMSprop.apply_singleT  s|    &&x~66
h#JAIUbi&9&999c
2=BGAJJ,<===r   )r   r   r\   r]   r^   r_   r   rb   r   r   r   r#   r9   r?   rK   r   r   s   @r   r   r   )  s         ( 	 UHbhZ-A$BBC  	     *.RX .d . . . .
>RX 
>"( 
>4 
> 
> 
> 
> 
> 
> 
> 
>r   r   c                        e Zd ZdZ	 ddeeeej        gej        f         f         def fdZ	dej        de
fdZd	ej        dej        de
fd
Z xZS )AdagradaX  The Adagrad optimizer [1].

    Our Adagrad implementation follows the original paper. In detail,

    [1]: Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient methods
    for online learning and stochastic optimization. JMLR 2011.

    .. math::

        v_{t+1} &= v_t + g_t^2 \\
        w_{t+1} &= w_t - \lambda \frac{g_t}{\sqrt{v_{t+1}} + \epsilon}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        eps (float, optional): The term :math:`\epsilon` added to the
          denominator to improve numerical stability. Default: ``1e-8``
    r   rW   r   c                     t                                                       |                     d|           || _        | j        dk     rt	          d| j         d          d S )NrW   r   zAdagrad epsilon should be >0, r   )rk   r#   r[   r   rl   )r!   rW   r   rn   s      r   r#   zAdagrad.__init__t  sl    
 	_m<<<8c>>PPPP   >r   rC   r;   c                 4    t          j        |          |d<   dS r   r   rG   s      r   r?   zAdagrad.init_single  r   r   rN   c                     | j                             |j                  }| j        }|d         t	          j        |          z   }||d<   |||z  t	          j        |          |z   z  z
  S )zZPerforms the Adagrad parameter update and stores :math:`v` in the
        optimizer state.r   )rW   r   r   r   r   r   r   )r!   rN   rC   r;   r   r   r   s          r   rK   zAdagrad.apply_single  se     &&x~66h#J8,,,c
2=BGAJJ,<===r   )r   r   r   s   @r   r   r   a  s         *  UHbhZ-A$BBC      .RX .d . . . .	>RX 	>"( 	>4 	> 	> 	> 	> 	> 	> 	> 	>r   r   c                        e Zd ZdZ	 	 ddeeeej        gej        f         f         dedef fdZ	dej        d	e
fd
Zdej        dej        d	e
fdZ xZS )AdaDeltaah  The AdaDelta optimizer with a learning rate [1].

    Our AdaDelta implementation follows the original paper. In detail,

    [1]: Zeiler, M.D., 2012. ADADELTA: an adaptive learning rate method. arXiv preprint arXiv:1212.5701.

    .. math::

        v_{t+1} &= \rho v_t + (1 - \rho) g_t^2 \\
        \Delta w_{t+1} &= \frac{\sqrt{u_t + \epsilon}}{\sqrt{v_{t+1} + \epsilon}} g_t \\
        u_{t+1} &= \rho u_t + (1 - \rho) \Delta w_{t+1}^2 \\
        w_{t+1} &= w_t - \lambda \Delta w_{t+1}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        rho (float, optional): The coefficient :math:`\rho` used for computing a
            running average of squared gradients. Default: ``0.9``
        eps (float, optional): The term :math:`\epsilon` added to the denominator to improve
          numerical stability. Default: `1e-8`
    ?ư>rW   rhor   c                    t                                                       |                     d|           || _        || _        | j        dk     rt          d| j         d          | j        dk     rt          d| j         d          d S )NrW   r   zAdaDelta rho should be >=0, r   zAdaDelta epsilon should be >0, )rk   r#   r[   r   r   rl   )r!   rW   r   r   rn   s       r   r#   zAdaDelta.__init__  s     	_m<<<8c>>NtxNNN   8c>>Q$(QQQ   >r   rC   r;   c                 b    t          j        |          |d<   t          j        |          |d<   dS )r   r   uNr   rG   s      r   r?   zAdaDelta.init_single  ,    ]9--c
]9--c


r   rN   c                    | j                             |j                  }| j        }| j        }|d         }|d         }||z  d|z
  t          j        |          z  z   }t          j        ||z             t          j        ||z             z  |z  }	||z  d|z
  t          j        |	          z  z   }||d<   ||d<   |||	z  z
  S )ziPerforms the AdaDelta parameter update and stores :math:`v` and
        :math:`u` in the optimizer state.r   r   rJ   )rW   r   r   r   r   r   r   r   )
r!   rN   rC   r;   r   r   r   r   r   ds
             r   rK   zAdaDelta.apply_single  s     &&x~66hh#J#J!Gq3w")H"5"555GAGrwq3w///(:!Gq3w")A,,..c
c
26!!r   )r   r   r   r   s   @r   r   r     s         0 	 UHbhZ-A$BBC  	     (.RX .d . . . .
"RX ""( "4 " " " " " " " "r   r   c            	            e Zd ZdZddgddfdeeeej        gej        f         f         de	e         ded	e
f fd
Zdej        defdZdej        dej        defdZ xZS )Adama  The Adam optimizer [1]. In detail,

    [1]: Kingma, D.P. and Ba, J., 2015. Adam: A method for stochastic
    optimization. ICLR 2015.

    .. math::

        m_{t+1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
        v_{t+1} &= \beta_2 v_t + (1 - \beta_2) g_t^2 \\
        w_{t+1} &= w_t - \lambda \frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        betas (Tuple[float, float], optional): The coefficients
          :math:`(\beta_1, \beta_2)` used for computing running averages of the
          gradient and its square. Default: ``(0.9, 0.999)``
        eps (float, optional): The term :math:`\epsilon` added to the
          denominator to improve numerical stability. Default: ``1e-8``
        bias_correction (bool, optional): If set to ``True``, bias correction
          is applied. Default: ``False``
    r   +?r   FrW   betasr   bias_correctionc                     t                                                       |                     d|           || _        || _        || _        d S rV   )rk   r#   r[   r   r   r   )r!   rW   r   r   r   rn   s        r   r#   zAdam.__init__  sM     	_m<<<
.r   rC   r;   c                 b    t          j        |          |d<   t          j        |          |d<   dS r   mr   Nr   rG   s      r   r?   zAdam.init_single  r   r   rN   c                 J   | j                             |j                  }| j        \  }}| j        }| j        }| j        }	|d         }
|d         }||
z  d|z
  |z  z   }
||z  d|z
  t          j        |          z  z   }|
|d<   ||d<   |r||d||	z  z
  z                      |j                  }t          j	        d||	z  z
                                |j                  }||
z  }t          j
        |          |z  |z   }|||z  z
  S |||
z  t          j
        |          |z   z  z
  S )zePerforms the Adam parameter update and stores :math:`v` and
        :math:`m` in the optimizer state.r   r   rJ   )rW   r   r   r   r   r   r   r   r   rsqrtr   )r!   rN   rC   r;   r   b1b2r   r   r   r   r   c1c2	numeratordenominators                   r   rK   zAdam.apply_single   s:    &&x~66Bh.y#J#JFa"f((Fa"f	( 3 333c
c
 	;BH%--hn==B!b$h,''..x~>>BQI'!**r/C/Ky;666rAvc)9:::r   )r\   r]   r^   r_   r   rb   r   r   r   r   r   r#   r9   r?   rK   r   r   s   @r   r   r     s         2 "5\ %/ /UHbhZ-A$BBC/ E{/ 	/
 / / / / / /.RX .d . . . .
;RX ;"( ;4 ; ; ; ; ; ; ; ;r   r   c                        e Zd ZdZddgdddfdeeeej        gej        f         f         de	e         d	ed
ede
f
 fdZdej        dej        def fdZ xZS )AdamWa2  The AdamW optimizer [1]. We update the weights with a weight_decay
    (:math:`\lambda`) value:

    [1]: Loshchilov, I. and Hutter, F., 2019. Decoupled weight decay
    regularization. ICLR 2019.

    .. math::

        m_{t+1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
        v_{t+1} &= \beta_2 v_t + (1 - \beta_2) g_t^2 \\
        w_{t+1} &= w_t - \alpha (\frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon} + \lambda w_t)

    Args:
        learning_rate (float or callable): The learning rate :math:`\alpha`.
        betas (Tuple[float, float], optional): The coefficients
          :math:`(\beta_1, \beta_2)` used for computing running averages of the
          gradient and its square. Default: ``(0.9, 0.999)``
        eps (float, optional): The term :math:`\epsilon` added to the
          denominator to improve numerical stability. Default: ``1e-8``
        weight_decay (float, optional): The weight decay :math:`\lambda`.
          Default: ``0.01``.
        bias_correction (bool, optional): If set to ``True``, bias correction
          is applied. Default: ``False``
    r   r   r   {Gz?FrW   r   r   r   r   c                 `    t                                          ||||           || _        d S )N)rW   r   r   r   )rk   r#   r   )r!   rW   r   r   r   r   rn   s         r   r#   zAdamW.__init__4  sB     	'+	 	 	
 	
 	
 )r   rN   rC   r;   c                     | j                             |j                  }t                                          ||d|| j        z  z
  z  |          S )zbPerforms the AdamW parameter update by modifying the parameters
        passed into Adam.
        rJ   )rW   r   r   rk   rK   r   )r!   rN   rC   r;   r   rn   s        r   rK   zAdamW.apply_singleD  sR    
 &&x~66ww##i1rD,='=#=>
 
 	
r   )r\   r]   r^   r_   r   rb   r   r   r   r   r   r#   r9   rK   r   r   s   @r   r   r     s         8 "5\" %) )UHbhZ-A$BBC) E{) 	)
 ) ) ) ) ) ) ) 
RX 
"( 
4 
 
 
 
 
 
 
 
 
 
r   r   c                        e Zd ZdZddgdfdeeeej        gej        f         f         de	e         def fdZ
d	ej        d
efdZdej        d	ej        d
efdZ xZS )Adamaxa  The Adamax optimizer, a variant of Adam based on the infinity norm [1].

    Our Adam implementation follows the original paper and omits the bias
    correction in the first and second moment estimates. In detail,

    [1]: Kingma, D.P. and Ba, J., 2015. Adam: A method for stochastic
    optimization. ICLR 2015.

    .. math::

        m_{t+1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
        v_{t+1} &= \max(\beta_2 v_t, |g_t|) \\
        w_{t+1} &= w_t - \lambda \frac{m_{t+1}}{v_{t+1} + \epsilon}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        betas (Tuple[float, float], optional): The coefficients
          :math:`(\beta_1, \beta_2)` used for computing running averages of the
          gradient and its square. Default: ``(0.9, 0.999)``
        eps (float, optional): The term :math:`\epsilon` added to the
          denominator to improve numerical stability. Default: ``1e-8``
    r   r   r   rW   r   r   c                     t                                          |||           d|k    st          d| j         d          d S )Nr   zEpsilon value should be >=0, r   )rk   r#   rl   r   )r!   rW   r   r   rn   s       r   r#   zAdamax.__init__g  sS     	s333czzOOOO   zr   rC   r;   c                 b    t          j        |          |d<   t          j        |          |d<   dS r   r   rG   s      r   r?   zAdamax.init_singles  r   r   rN   c                 "   | j                             |j                  }| j        \  }}| j        }|d         }|d         }	||z  d|z
  |z  z   }t          j        ||	z  t          j        |                    }	||d<   |	|d<   |||z  |	|z   z  z
  S )zgPerforms the Adamax parameter update and stores :math:`v` and
        :math:`m` in the optimizer state.r   r   rJ   )rW   r   r   r   r   r   maximumabs)
r!   rN   rC   r;   r   r   r   r   r   r   s
             r   rK   zAdamax.apply_singlex  s     &&x~66Bh#J#JFa"f((JrAvrvh//00c
c
26QW---r   r\   r]   r^   r_   r   rb   r   r   r   r   r#   r9   r?   rK   r   r   s   @r   r   r   O  s         4 "5\	
 
UHbhZ-A$BBC
 E{
 	
 
 
 
 
 
.RX .d . . . .
.RX ."( .4 . . . . . . . .r   r   c                        e Zd ZdZddgdfdeeeej        gej        f         f         de	e         def fdZ
d	ej        d
efdZdej        d	ej        d
efdZ xZS )Liona&  The Lion optimizer [1].

    Since updates are computed through the sign operation, they tend to
    have larger norm than for other optimizers such as SGD and Adam.
    We recommend a learning rate that is 3-10x smaller than AdamW and a
    weight decay 3-10x larger than AdamW to maintain the strength
    (lr * wd). Our Lion implementation follows the original paper. In
    detail,

    [1]: Chen, X. Symbolic Discovery of Optimization Algorithms. arXiv
    preprint arXiv:2302.06675.

    .. math::

        c_{t + 1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
        m_{t + 1} &= \beta_2 m_t + (1 - \beta_2) g_t \\
        w_{t + 1} &= w_t - \eta (\text{sign}(c_t) + \lambda w_t)

    Args:
        learning_rate (float or callable): The learning rate :math:`\eta`.
        betas (Tuple[float, float], optional): The coefficients
          :math:`(\beta_1, \beta_2)` used for computing the gradient
          momentum and update direction. Default: ``(0.9, 0.99)``
        weight_decay (float, optional): The weight decay :math:`\lambda`. Default: ``0.0``
    r   r   r   rW   r   r   c                     t                                                       |                     d|           || _        || _        d S rV   )rk   r#   r[   r   r   )r!   rW   r   r   rn   s       r   r#   zLion.__init__  sF     	_m<<<
(r   rC   r;   c                 4    t          j        |          |d<   dS )r   r   Nr   rG   s      r   r?   zLion.init_single  r   r   rN   c                    | j                             |j                  }| j        \  }}| j        }|d         }||z  d|z
  |z  z   }	||z  d|z
  |z  z   |d<   |dk    rd||z  z
  |z  }||t          j        |	          z  z
  S )zWPerforms the Lion parameter update and stores :math:`m`
        in the optimizer state.r   rJ   r   )rW   r   r   r   r   r   sign)
r!   rN   rC   r;   r   r   r   r   r   cs
             r   rK   zLion.apply_single  s     &&x~66B(#JFa"f((!Vq2v11c
!R,..);I2

?**r   r   r   s   @r   r   r     s         : "4[!	
) 
)UHbhZ-A$BBC
) E{
) 	
) 
) 
) 
) 
) 
).RX .d . . . .+RX +"( +4 + + + + + + + +r   r   c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 dd	eeeej        gej        f         df         d
e	eef         dedede
e         dedededef fdZdej        defdZd Zd Zd Zdej        dej        defdZ xZS )	Adafactora  The Adafactor optimizer.

    Our Adafactor implementation follows the original paper: `Adafactor:
    Adaptive Learning Rates with Sublinear Memory Cost
    <https://arxiv.org/abs/1804.04235>`_

    Args:
        learning_rate (float or callable, optional): The learning rate.
            Default: ``None``.
        eps (tuple(float, float), optional): The first term :math:`\epsilon_1`
            added to the square of the gradients to improve numerical
            stability and the second term :math:`\epsilon_2` is used for
            parameter scaling if ``parameter_scale`` is set to ``True``.
            Default: ``(1e-30, 1e-3)``.
        clip_threshold (float, optional): Clips the unscaled update at
            ``clip_threshold``. Default: ``1.0``.
        decay_rate (float, optional): Coefficient for the running average
            of the squared gradient. Default: ``-0.8``.
        beta_1 (float, optional): If set to a value bigger than zero
            then first moment will be used. Default: ``None``.
        weight_decay (float, optional): The weight decay :math:`\lambda`.
            Default: ``0.0``.
        scale_parameter (bool, optional): If set to ``True`` the learning rate
            will be scaled by :math:`\max(\epsilon_1, \text{RMS}(w_{t-1}))`.
            Default: ``True``.
        relative_step (bool, optional): If set to ``True`` the ``learning_rate``
            will be ignored and relative step size will be computed.
            Default: ``True``.
        warmup_init (bool, optional): If set to ``True`` then the relative
            step size will be calculated by the current step. Default:
            ``False``.
    NgKH9gMbP?      ?皙r   TFrW   r   clip_threshold
decay_ratebeta_1r   scale_parameterrelative_stepwarmup_initc
                     t                                                       ||                     d|           || _        || _        || _        || _        || _        || _        || _	        |	| _
        d S rV   )rk   r#   r[   r   r   r   r   r   r   r   r   )r!   rW   r   r   r   r   r   r   r   r   rn   s             r   r#   zAdafactor.__init__  sz     	$  -@@@,$(.*&r   rC   r;   c                 B   |j         dk    r\|j        }|j        }t          j        |dd         |          |d<   t          j        |dd         |dd         z   |          |d<   nt          j        |          |d<   | j        t          j        |          |d	<   dS dS )
r      N)r   exp_avg_sq_rowexp_avg_sq_col
exp_avg_sqexp_avg)ndimshaper   r   zerosr   r   )r!   rC   r;   r   r   s        r   r?   zAdafactor.init_single  s    >QOEOE&(huSbSz&G&G&GE"#&(huSbSzE"##J/Fe&T&T&TE"##"$-	":":E,;"!}Y77E) #"r   c                 r    t          j        t          j        t          j        |                              S r-   )r   r   meanr   )r!   inputss     r   _compute_rmszAdafactor._compute_rms  s&    wrwry0011222r   c                    | j         r6| j        rd|z  nd}t          j        |t          j        |                    }n| j        }|                    |j                  }d}| j        r t          j	        | j
        d         |          }||z  S )Nr   r   r   rJ   )r   r   r   minimumr   rW   r   r   r   r   r   )r!   r   parameter_rmsmin_steprelative_step_sizeparameter_scales         r   _compute_learning_ratez Adafactor._compute_learning_rate  s     	4&*&6@td{{DH!#Hbhtnn!E!E!%!3/66}7JKK 	E j!mDDO!333r   c                     t          j        |t          j        |dd          z            }t          j        |          }t          j        t          j        |d          t          j        |d                    S )Nr   T)axiskeepdimsr  r   )r   r   r   matmulexpand_dims)r!   r   r   r_factorc_factors        r   _approximate_exp_moving_avgz%Adafactor._approximate_exp_moving_avg  st    8RW^"tLLLL
 
 8N++yN8"---r~hQ/O/O/O
 
 	
r   rN   c                 |   |j         dk    }| j        }| j        du}|                     |          }|                     ||          }d|| j        z                      |j                  z
  }	t          j	        |          | j
        d         z   }
|rz|d         }|d         }|	|z  d|	z
  t          j        |
d	          z  z   }|	|z  d|	z
  t          j        |
d
	          z  z   }||d<   ||d<   |                     ||          }
|
|z  }
n2|d         }|	|z  d|	z
  |
z  z   }||d<   t          j        |          |z  }
|
t          j        d|                     |
          | j        z            z  }
||
z  }
|r'|d         }| j        |z  d| j        z
  |
z  z   }||d<   |}
| j        dk    r||| j         |z  z  z  }||
z
  S )z2Performs the Adafactor parameter and state update.r   Nr   r   r   r   rJ   r   r  r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   )r!   rN   rC   r;   factoredr   use_first_momentr   rW   beta_2r'   r   r   r   r   s                  r   rK   zAdafactor.apply_single$  s"   =A%y;d2)))4433D-HHdo-55m6IJJJ8$$tx{2 	5"#34N"#34N$~5VrwvB7777N %~5VrwvB7777N '5E"#&4E"#55nnUUFh&FF|,J :-1v:2GHJ",E,Xj))H4F"*""6**T-@@
 
 
 ' 	I&G{W,!dk/V1KLG&E)F!!t'8&8=&HIII6!!r   )	Nr   r   r   Nr   TTF)r\   r]   r^   r_   r   rb   r   r   r   r   r   r   r#   r9   r?   r   r   r  rK   r   r   s   @r   r   r     sk        F MQ#0 # "&! $"!' 'UHbhZ-A$BDHI' 5%< ' 	'
 ' ' ' ' ' ' ' ' ' ' '08RX 8d 8 8 8 83 3 34 4 4
 
 
,"RX ,""( ,"4 ," ," ," ," ," ," ," ,"r   r   c                        e Zd ZdZ	 	 	 	 ddeeeej        gej        f         f         deded	e	d
e
f
 fdZdej        defdZde
fdZdej        dej        defdZ xZS )Muona%  The Muon optimizer.

    Our Muon (MomentUm Orthogonalized by Newton-schulz) optimizer follows the
    original implementation: `Muon: An optimizer for hidden layers in neural
    networks <https://kellerjordan.github.io/posts/muon/>`_

    Note:
        - Muon may be sub-optimal for the embedding layer, the final fully
          connected layer, or any 0D/1D parameters. Those should be optimized
          by a different method (e.g., :class:`AdamW`).
        - For 4D convolutional filters, it works by flattening their last
          dimensions.

    Args:
        learning_rate (float or callable): The learning rate.
        momentum (float, optional): The momentum strength. Default: ``0.95``
        weight_decay (float, optional): The weight decay (L2 penalty).
            Default: ``0.01``
        nesterov (bool, optional): Enables Nesterov momentum. Recommended for
            better performance.  Default: ``True``
        ns_steps (int, optional): Number of Newton-Schulz iteration steps for
            orthogonalization.  Default: ``5``
    ffffff?r   T   rW   r   r   r   ns_stepsc                     t                                                       |                     d|           || _        || _        || _        || _        d S rV   )rk   r#   r[   r   r   r   r  )r!   rW   r   r   r   r  rn   s         r   r#   zMuon.__init__l  sR     	_m<<< (  r   rC   r;   c                 4    t          j        |          |d<   dS r   r   rG   s      r   r?   zMuon.init_single|  r   r   stepsc                    |j         dk    sJ d|j         d            d\  }}}|j        d         |j        d         k    }|r|j        }|t          j                            |d          d	z   z  }t          |          D ]D}||j        z  }t          j        ||z  ||d
|          }	t          j        ||z  |	|d
d
          }E|r|j        }|S )Nr   z;Expected a 2D array for Newton-Schulz iteration, got shape z	 instead.)guV@ggn@ @r   r   T)r  gHz>r   )betar   )r   r   Tr   linalgnormr5   addmm)
r!   Xr  abr   transpose_neededr/   ABs
             r   _zeropower_via_newtonschulz5z!Muon._zeropower_via_newtonschulz5  s    FaKKK[[[[ KK+1a72;4 	AD11D89u 	; 	;AACAQ13a888AQ13c:::AA 	Ar   rN   c                 T   | j         dk    r|| j         |z  z   }| j        |d         z  }|d| j        z
  |z  z   }||d<   | j        r|d| j        z
  z  || j        z  z   }n|}| j                            |j                  }|j        dk    r|j        }|j        dk    }|r"t          j	        ||j        d         df          }| 
                    || j                  }|rt          j	        ||          }|t          d|j        d         |j        d         z            dz  z  }|||z  z
  S )	z"Performs the Muon parameter updater   r   rJ   r   r   )r  r   g      ?)r   r   r   rW   r   r   r   r   r   reshaper!  r  max)	r!   rN   rC   r;   r   r'   r   original_shapereshape_neededs	            r   rK   zMuon.apply_single  sK    !!$"3i"??HME#J&T]"h..c
= 	T]!23a$-6GGFFF&&x~66;!#\N#[1_N CFV\!_b,ABB66vT]6SSF <FN;;#ab)FL,<<==DDB2;&&r   )r  r   Tr  )r\   r]   r^   r_   r   rb   r   r   r   r   intr#   r9   r?   r!  rK   r   r   s   @r   r  r  S  s        6 "! !UHbhZ-A$BBC! ! 	!
 ! ! ! ! ! ! ! .RX .d . . . .S    *'RX '"( '4 ' ' ' ' ' ' ' 'r   r  c                     t          d | d          }t          j        |          }t          j        ||dz   z  d          t	          fd|           }||fS )a  Clips the global norm of the gradients.

    This function ensures that the global norm of the gradients does not exceed
    ``max_norm``. It scales down the gradients proportionally if their norm is
    greater than ``max_norm``.

    Example:
        >>> grads = {"w1": mx.array([2, 3]), "w2": mx.array([1])}
        >>> clipped_grads, total_norm = clip_grad_norm(grads, max_norm=2.0)
        >>> print(clipped_grads)
        {"w1": mx.array([...]), "w2": mx.array([...])}

    Args:
        grads (dict): A dictionary containing the gradient arrays.
        max_norm (float): The maximum allowed global norm of the gradients.

    Returns:
        (dict, float): The possibly rescaled gradients and the original
        gradient norm.
    c                 T    | |                                                                 z   S r-   )r   sum)accrw   s     r   r0   z clip_grad_norm.<locals>.<lambda>  s    cAHHJJNN4D4D.D r   r   r   r   c                     | z  S r-   r   )rw   
normalizers    r   r0   z clip_grad_norm.<locals>.<lambda>  s    q:~ r   )r   r   r   r   r
   )gradsmax_normnorm_squared
total_normclipped_gradsr-  s        @r   clip_grad_normr3    sh    * DDeSQQL&&JH
T(9:C@@J5555u==M*$$r   )typingr   r   r   r   r   mlx.corecorer   mlx.nnr   	mlx.utilsr	   r
   r   r   r   r   re   r   r   r   r   r   r   r   r   r   r  r3  r   r   r   <module>r9     s   : 9 9 9 9 9 9 9 9 9 9 9 9 9             U U U U U U U U U U U U U UP% P% P% P% P% P% P% P%fF, F, F, F, F,Y F, F, F,R@N @N @N @N @N) @N @N @NF5> 5> 5> 5> 5>i 5> 5> 5>p/> /> /> /> />i /> /> />d@" @" @" @" @"y @" @" @"FA; A; A; A; A;9 A; A; A;H2
 2
 2
 2
 2
D 2
 2
 2
j8. 8. 8. 8. 8.T 8. 8. 8.v7+ 7+ 7+ 7+ 7+9 7+ 7+ 7+tL" L" L" L" L"	 L" L" L"^a' a' a' a' a'9 a' a' a'H% % % % %r   