
    |j+                        d dl mZ d dlZd dlmZ d dlZd dlmZ d dlmZ ddl	m
Z
 ddlmZ d	d
lmZ er8d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d	dlmZ d	dlmZ  G d de          Zg Z G d de          ZdS )    )annotationsN)TYPE_CHECKING)_C_ops)in_dynamic_or_pir_mode   )	framework)no_grad   )	Optimizer)Sequence)NotRequired)Tensor)GradientClipBase)WeightDecayRegularizer)LRScheduler)_ParameterConfigc                  $    e Zd ZU ded<   ded<   dS )_AdadeltaParameterConfigzNotRequired[float]epsilonrhoN)__name__
__module____qualname____annotations__     i/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/optimizer/adadelta.pyr   r   '   s*         ####r   r   c                  Z     e Zd ZU dZded<   dZdZ	 	 	 	 	 	 	 dd fdZd Zd Z	d Z
 xZS )Adadeltaa  
    **Notes: This API does not support sparse parameter optimization.**

    Adadelta Optimizer. Please refer to this for details:
    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.

    The update is done as follows:

    .. math::

        E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g_t^2

        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }

        E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g_t*learning\_rate)^2
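
    A minimal NumPy sketch of the single step these formulas describe (purely
    illustrative; the helper ``adadelta_step`` and its argument names are
    hypothetical, not part of this API):

    .. code-block:: python

        >>> import numpy as np
        >>> def adadelta_step(param, grad, avg_sq_grad, avg_sq_update, rho=0.95, eps=1e-6):
        ...     # decayed average of squared gradients: E(g_t^2)
        ...     avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad ** 2
        ...     # adaptive per-element learning rate
        ...     lr = np.sqrt((avg_sq_update + eps) / (avg_sq_grad + eps))
        ...     update = -lr * grad
        ...     # decayed average of squared updates: E(dx_t^2)
        ...     avg_sq_update = rho * avg_sq_update + (1 - rho) * update ** 2
        ...     return param + update, avg_sq_grad, avg_sq_update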

    Args:
        learning_rate (float|Tensor|LRScheduler, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type, or an ``LRScheduler``. The default value is 0.001.
        epsilon (float, optional): A small float number added for numerical stability. Default 1.0e-6.
        rho (float, optional): A floating point value indicating the decay rate. Default 0.95.
        parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. You can also specify different options \
            (such as the learning rate or weight decay) for different parameter groups by passing \
            a list of dicts; note that the learning_rate in a parameter group \
            is a scale factor applied to the base learning_rate. \
            The default value is None in static graph mode, in which case all parameters will be updated.
        weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
            It can be an int or float value acting as the coefficient of L2 regularization, or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping. A short usage sketch follows this list.
        name (str|None, optional): The default value is None. Normally there is no need for the user
                to set this property. For more information, please refer to
                :ref:`api_guide_Name` .
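
    A short sketch of passing one of the clipping strategies named above to
    ``grad_clip`` (the ``clip_norm`` threshold of 1.0 is an arbitrary,
    illustrative value):

    .. code-block:: python

        >>> import paddle
        >>> linear = paddle.nn.Linear(10, 10)
        >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        >>> opt = paddle.optimizer.Adadelta(
        ...     learning_rate=0.1,
        ...     parameters=linear.parameters(),
        ...     grad_clip=clip)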

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
            >>> loss.backward()
            >>> adadelta.step()
            >>> adadelta.clear_grad()

            >>> # Note that the effective learning_rate of linear_2 is 0.01:
            >>> # the group-level scale 0.1 times the base learning_rate 0.1.
            >>> linear_1 = paddle.nn.Linear(10, 10)
            >>> linear_2 = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear_1(inp)
            >>> out = linear_2(out)
            >>> loss = paddle.mean(out)
            >>> adadelta = paddle.optimizer.Adadelta(
            ...     learning_rate=0.1,
            ...     parameters=[{  # type: ignore
            ...         'params': linear_1.parameters()
            ...     }, {
            ...         'params': linear_2.parameters(),
            ...         'weight_decay': 0.001,
            ...         'learning_rate': 0.1,
            ...     }],
            ...     weight_decay=0.01)
            >>> loss.backward()
            >>> adadelta.step()
            >>> adadelta.clear_grad()
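
            >>> # Per-group ``rho``/``epsilon`` overrides are also accepted,
            >>> # mirroring the optional keys of ``_AdadeltaParameterConfig``
            >>> # (the override values below are illustrative only):
            >>> adadelta = paddle.optimizer.Adadelta(
            ...     learning_rate=0.1,
            ...     parameters=[{  # type: ignore
            ...         'params': linear_1.parameters(),
            ...         'rho': 0.9,
            ...         'epsilon': 1.0e-5,
            ...     }])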

    strtype_avg_squared_grad_avg_squared_updateMbP?ư>ffffff?Nlearning_ratefloat | Tensor | LRSchedulerr   floatr   
parameters<Sequence[Tensor] | Sequence[_AdadeltaParameterConfig] | Noneweight_decay%float | WeightDecayRegularizer | None	grad_clipGradientClipBase | Nonename
str | NonereturnNonec                   |t          d          |t          d          |t          d          t                                          |||||           d| _        i | _        d| _        || _        || _        ||d| _        d S )Nzlearning_rate is not set.zepsilon is not set.zrho is not set.)r'   r*   r,   r.   r0   Fadadelta)r   r   )	
ValueErrorsuper__init___multi_precision_master_weightsr!   _epsilon_rho_default_dict)	selfr'   r   r   r*   r,   r.   r0   	__class__s	           r   r8   zAdadelta.__init__   s      8999?2333;.///'!% 	 	
 	
 	
 !&!		
 
r   c                &   t          |t          j        t          j        j        f          st          d          t          |t                    r|                    d          }|D ](}|j        | j	        v r| j
        r|                     |j                  rk|                     |          }|                     | j        |           |                     | j        |           | j	                            |j                   |                     |j                  r| j
        st%          j        d           |                     | j        |           |                     | j        |           | j	                            |j                   *d S )N)block is not instance of framework.Block.paramszAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Lars optimizer.)
isinstancer   Blockpaddlepir	TypeErrordictgetr0   _already_create_accumulatorr9   _is_dtype_fp16_or_bf16dtype_create_master_weight_add_accumulator_avg_squared_grad_acc_str_avg_squared_update_acc_straddwarningswarn)r>   blockr*   pmaster_ps        r   _create_accumulatorszAdadelta._create_accumulators   s   %)/6:3C!DEE 	IGHHHj$'' 	2#11J 	9 	9Av999$ )D)DQW)M)M 55a88%%d&DhOOO%%4h   044QV<<<++AG44- X   !!$"@!DDD!!$"BAFFF,008888+	9 	9r   c                   t          |t                    r|                     |          }|                     | j        |d                   }|                     | j        |d                   }| j        o|                     |d         j                  }|r| j	        |d         j
                 nd }t                      rmt                      5  t          j        |d         |d         |||                     |          || j        | j        |	  	         d d d            n# 1 swxY w Y   d S t          |t$          j        t(          j        j        f          st-          d          |d         |d         |||                     |          d}|d         ||d}|r
||d<   ||d<   |                    | j        ||| j        | j        |dd	
          }	|	S )Nr   r
   rA   )ParamGradAvgSquaredGradAvgSquaredUpdateLearningRate)ParamOutAvgSquaredGradOutAvgSquaredUpdateOutMasterParamMasterParamOut)r   r   multi_precisionT)r!   inputsoutputsattrsstop_gradient)rC   rH   _update_param_group_get_accumulator_masterrO   rP   r9   rK   rL   r:   r0   r   r	   r   	adadelta__create_param_lrr<   r;   r   rD   rE   rF   rG   	append_opr!   )
r>   rT   param_and_gradavg_squared_grad_accavg_squared_update_accfind_mastermaster_weightrd   re   adadelta_ops
             r   _append_optimize_opzAdadelta._append_optimize_op   sj   nd++ 	F!55nEEN#;;*N1,= 
  
 "&!=!=,nQ.?"
 "
 + 
0K0K1#1
 1

 D !2!788 	 "## .	   "1%"1%(*)).99!IM
 
 
               4eiovz7G%HII M KLLL (*&q)"6$: $ 5 5n E E F +1-%9'= G
  :(5}%,9()//Y#}9'2 
 # * 
 
K s   ADD"%D"c                    |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d          }|S )Nr   r   rB   )rI   r=   r;   r<   )r>   r*   s     r   rh   zAdadelta._update_param_group  sS    "y$2DY2OPPNN5$*<U*CDD	^^H--
r   )r$   r%   r&   NNNN)r'   r(   r   r)   r   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   )r   r   r   __doc__r   rO   rP   r8   rW   rs   rh   __classcell__)r?   s   @r   r   r   /   s         O Ob III 3"7 7< >B-1!
 !
 !
 !
 !
 !
 !
F9 9 9:A A AF      r   r   )
__future__r   rR   typingr   rE   r   paddle.base.frameworkr   baser   base.dygraphr	   	optimizerr   collections.abcr   typing_extensionsr   r   paddle.nn.clipr   paddle.regularizerr   lrr   r   r   __all__r   r   r   r   <module>r      s   # " " " " "                     8 8 8 8 8 8       " " " " " "              ((((((------//////999999++++++         #3      
 ] ] ] ] ]y ] ] ] ] ]r   