
from __future__ import annotations

from typing import TYPE_CHECKING

from paddle import _C_ops, pir
from paddle.base.executor import global_scope

from ..base import core, framework
from ..base.framework import Variable
from .optimizer import Optimizer

if TYPE_CHECKING:
    from collections.abc import Sequence
    from typing import Callable

    from typing_extensions import NotRequired

    from paddle import Tensor
    from paddle.nn.clip import GradientClipBase

    from .optimizer import _ParameterConfig


class _LambParameterConfig(_ParameterConfig):
    beta1: NotRequired[float | Tensor]
    beta2: NotRequired[float | Tensor]
    epsilon: NotRequired[float | Tensor]
    lamb_weight_decay: NotRequired[float]
    exclude_from_weight_decay_fn: NotRequired[Callable[[Tensor], bool] | None]


__all__ = []


class Lamb(Optimizer):
    r"""
    LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.

    LAMB Optimizer is designed to scale up the batch size of training without losing
    accuracy, which supports adaptive element-wise updating and accurate layer-wise
    correction. For more information, please refer to `Large Batch Optimization for
    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .

    The updating of parameters follows:

    ..  math::

        m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t

        v_t &= \beta_2 v_{t - 1}  + (1 - \beta_2)g_t^2

        m_t &= \frac{m_t}{1 - \beta_1^t}

        v_t &= \frac{v_t}{1 - \beta_2^t}

        r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon}

        w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})


    where :math:`m` is the 1st moment, :math:`v` the 2nd moment, :math:`\eta` the
    learning rate, and :math:`\lambda` the LAMB weight decay rate.
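
    For intuition, the update above can be written as a few lines of NumPy. The
    sketch below is illustrative only; ``lamb_step`` and its arguments are
    hypothetical names, not part of the Paddle API:

    .. code-block:: python

        >>> import numpy as np
        >>> def lamb_step(w, g, m, v, t, lr, beta1=0.9, beta2=0.999,
        ...               eps=1e-6, lamb_wd=0.01):
        ...     m = beta1 * m + (1 - beta1) * g                # 1st moment
        ...     v = beta2 * v + (1 - beta2) * g * g            # 2nd moment
        ...     m_hat = m / (1 - beta1 ** t)                   # bias correction
        ...     v_hat = v / (1 - beta2 ** t)
        ...     r = m_hat / (np.sqrt(v_hat) + eps) + lamb_wd * w
        ...     trust = np.linalg.norm(w) / np.linalg.norm(r)  # layer-wise ratio
        ...     return w - lr * trust * r, m, v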

    Args:
        learning_rate (float|Tensor, optional): the learning rate used to update parameters. \
            Can be a float value or a Tensor with data type float32. Default 0.001.
        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Note that
            LAMB uses this instead of the generic ``weight_decay``, which must be left as None.
        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
            Default 0.9.
        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
            Default 0.999.
        epsilon (float|Tensor, optional): A small float value for numerical stability. Default 1e-6.
        parameters (list|tuple|None, optional): List or tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in parameter groups \
            represents the scale of base learning_rate. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_base_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_base_clip_ClipGradByNorm` ,
            :ref:`api_paddle_base_clip_ClipGradByValue` ). If you want better convergence, it is recommended
            to use :ref:`api_paddle_base_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping.
        exclude_from_weight_decay_fn (Callable|None, optional): A function that takes a parameter
            as input and returns True if weight decay should be skipped for that parameter. Default None.
        multi_precision (bool, optional): Whether to use multi-precision during weight updates. Default False.
        always_adapt (bool, optional): Whether to always use layer-wise LR adaptation. By default,
            adaptation is skipped for parameters excluded from weight decay; when ``always_adapt`` is
            True, LR adaptation is always enabled. Default False.
        name (str|None, optional): For detailed information, please refer to
            :ref:`api_guide_Name` . Usually the name does not need to be set, and it is None by default.

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
            >>> beta2 = paddle.to_tensor([0.85], dtype="float32")
            >>> lamb = paddle.optimizer.Lamb(
            ...     learning_rate=0.002,
            ...     beta1=beta1,
            ...     beta2=beta2,
            ...     parameters=linear.parameters(),
            ...     lamb_weight_decay=0.01
            ... )
            >>> back = out.backward()
            >>> lamb.step()
            >>> lamb.clear_grad()
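
            >>> # A sketch of per-parameter-group options (the dict form of
            >>> # ``parameters``; keys follow ``_LambParameterConfig``) plus a
            >>> # hypothetical predicate that skips weight decay for biases:
            >>> lamb = paddle.optimizer.Lamb(
            ...     learning_rate=0.002,
            ...     parameters=[{
            ...         'params': linear.parameters(),
            ...         'beta1': 0.9,
            ...         'lamb_weight_decay': 0.001,
            ...     }],
            ...     exclude_from_weight_decay_fn=lambda p: 'bias' in p.name,
            ... )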

    moment1moment2beta1_pow_accbeta2_pow_accMbP?{Gz??+?ư>NFlearning_ratefloat | Tensorr   floatr   r   r   
parameters8Sequence[Tensor] | Sequence[_LambParameterConfig] | None	grad_clipGradientClipBase | Noner   Callable[[Tensor], bool] | Nonemulti_precisionboolalways_adaptname
str | NonereturnNonec                   |J |J |J |J t                                          ||d ||           d| _        || _        || _        || _        || _        || _        |||||d| _        i | _	        i | _
        |	| _        |
| _        d S )N)r-   r0   weight_decayr2   r8   lamb)r   r   r   r   r   )super__init__type_beta1_beta2_epsilon_lamb_weight_decay_exclude_from_weight_decay_fn_default_dict_master_weights_used_master_weights_multi_precisionr7   )selfr-   r   r   r   r   r0   r2   r   r5   r7   r8   	__class__s               r!   r@   zLamb.__init__   s      (((      """'! 	 	
 	
 	
 	"3-I*!2,H
 
  "$&! /(r    c                   |t                      }|                    |                                          }| j                            |          }||                    |                                          }|                                |                                k    sJ |                                |                                k    sJ nd }||fS N)r   find_var
get_tensorrI   get_dtypeshape)rK   r8   scopep_tmaster_name
master_p_ts         r!   _get_parameterzLamb._get_parameter   s    = NNEnnT""--///33D99"44??AAJ$$&&#**,,6666##%%44444JJr    c                *   t          |t          j        t          j        f          st	          d          t          |t
                    r|                     |          }|D ]}|j        | j        v r| j	        rd| 
                    |j                  rJ|                     |          }|                     |           | j                            |j                   ||                     |           | j                            |j                   d S )Nblock is not instance of Block.)
isinstancer
   Blockr   	TypeErrordict_update_param_groupr8   _already_create_accumulatorrJ   _is_dtype_fp16_or_bf16dtype_create_master_weight_add_moments_powsadd)rK   blockr0   pmaster_ps        r!   _create_accumulatorszLamb._create_accumulators   s   %)/39!=>> 	?=>>>j$'' 	>11*==J  		= 		=Av999$ =)D)DQW)M)M =55a88&&x000044QV<<<<&&q)))044QV<<<<		= 		=r    c           	     :   |j         }|                     |          rt          j        j        j        }|                     | j        ||           |                     | j        ||           |                     | j	        ||t          | j        t                    rdn| j        dgt          j        j        j        d           |                     | j        ||t          | j        t                    rdn| j        dgt          j        j        j        d           d S )N)rb   r*   r   cpu)r8   paramrb   
fill_valuerS   rA   devicer+   )rb   ra   r	   VarDescVarTypeFP32_add_accumulator_moment1_acc_str_moment2_acc_str_beta1_pow_acc_strr[   rB   r   DENSE_TENSOR_beta2_pow_acc_strrC   )rK   rg   	acc_dtypes      r!   rd   zLamb._add_moments_pows   s    G	&&y11 	2,1Id3QiHHHd3QiHHH(!$+x88Idk#%2 	 
	
 
	
 
	
 	(#DK::K#%2 	 
	
 
	
 
	
 
	
 
	
r    c                   t          |t          j        t          j        f          st	          d          t          |t
                    r|                     |          }d|j        _        | 	                    | j
        |d                   }| 	                    | j        |d                   }| 	                    | j        |d                   }| 	                    | j        |d                   }| j        |                     |d                   rd}n| j        }|                     |          }| j        o|                     |d         j                  }	|d         j        }
|	r| j        |
         }|j        | j        |
<   nd }t          j                    rDt1          j        |d         |d         ||||||d || j        | j        | j        | j        |	           d S |d         |d         |||||d}|d         ||||d}| j        | j        | j        || j        |	d}|	r
||d	<   ||d
<   |                     d          }|r||d<   |                    | j         |||d          }|S )NrZ   Tr   g        r   )ParamGradLearningRateMoment1Moment2Beta1PowBeta2Pow)ParamOut
Moment1Out
Moment2OutBeta1PowOutBeta2PowOut)r   r   r   r=   r7   r5   MasterParamMasterParamOut	found_inf
SkipUpdate)rA   inputsoutputsattrsstop_gradient)!r[   r
   r\   r   r]   r^   r_   program	_use_lamb_get_accumulator_masterrs   rt   ru   rw   rF   rE   _create_param_lrrJ   ra   rb   r8   rH   rI   in_dynamic_or_pir_moder   lamb_rB   rC   rD   r7   _get_auxiliary_var	append_oprA   )rK   rf   param_and_gradr$   r%   r&   r'   r=   lrfind_masterp_namemaster_weightr   r   r   r   lamb_ops                    r!   _append_optimize_opzLamb._append_optimize_op   s   %)/39!=>> 	?=>>>nd++ 	F!55nEEN"&..!>!#4
 
 ..!>!#4
 
 44#^A%6
 
 44#^A%6
 

 .:22>!3DEE ; LL2L"">22+ 
0K0K1#1
 1
  "' 	! 08M0=0BD%f-- M+-- >	Lq!q!!  " 4 (*&q) """)) F +1-%%,, G = , $ 1#. E  :(5}%,9()//<<I 1'0|$ooY" &  G Nr    c                   |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d          }|S )Nr   r   r   r   r   params)rQ   rG   rB   rC   rD   rE   rF   )rK   r0   s     r!   r_   zLamb._update_param_group[  s     nnWd.@.IJJ nnWd.@.IJJ"y$2DY2OPP",..!34G!H#
 #
 .8^^*=>.
 .
*  ^^H--
r    )r(   r)   r*   r+   r,   NNNFFN)r-   r.   r   r/   r   r.   r   r.   r   r.   r0   r1   r2   r3   r   r4   r5   r6   r7   r6   r8   r9   r:   r;   rN   )r   r   r   __doc__rs   rt   ru   rw   r@   rX   ri   rd   r   r_   __classcell__)rL   s   @r!   r#   r#   2   s        L L\ ! (( ).#' # %"& -1HL %",) ,) ,) ,) ,) ,) ,)\   = = =$
 
 
<f f fP      r    r#   N)