
    |jb                       d dl mZ d dlZd dlmZ d dlZd dlmZmZ d dlm	Z	 d dl
mZ ddlmZmZ d	d
lmZ er8d dlmZ d dlmZ d dlmZ d dlmZ d dl
mZ d	dlmZ d	dlmZ  G d de          Zg Z G d de          ZdS )    )annotationsN)TYPE_CHECKING)_C_opspir)in_dynamic_or_pir_mode)L2Decay   )core	framework   )	Optimizer)Sequence)NotRequired)Tensor)GradientClipBase)WeightDecayRegularizer)LRScheduler)_ParameterConfigc                  B    e Zd ZU ded<   ded<   ded<   ded<   ded<   d	S )
_MomentumParameterConfigzNotRequired[float]momentumzNotRequired[bool]use_nesterovrescale_gradzNotRequired[str]regularization_methodregularization_coeffN)__name__
__module____qualname____annotations__     i/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/optimizer/momentum.pyr   r   (   sN         $$$$''''((((////000000r!   r   c                  n     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 d#d$ fdZd Zd Zd% fd	Zd Z	d  Z
d! Zd" Z xZS )&Momentuma  

    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nestrov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad   param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad   param = param - learning\_rate * velocity

    Parameters:

        learning_rate (float|Tensor|LRScheduler, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LRScheduler. The default value is 0.001.
        momentum (float): Momentum factor. The default value is 0.9.
        parameters (list|tuple|None, optional): List|Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in parameter groups \
            represents the scale of base learning_rate. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        use_nesterov(bool, optional): Enables Nesterov momentum. The default value is False.
        weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
            It can be a int or float value as coeff of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
            Often choose to be ``1.0/batch_size``.
        use_multi_tensor (bool, optional): Whether to use multi-tensor strategy to update all parameters at once . Default is false.
        name (str|None, optional): The default value is None. Normally there is no need for user
                to set this property. For more information, please refer to
                :ref:`api_guide_Name` .

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> inp = paddle.to_tensor(inp)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> momentum = paddle.optimizer.Momentum(
            ...     learning_rate=0.1,
            ...     parameters=linear.parameters(),
            ...     weight_decay=0.01
            ... )
            >>> back = out.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()

            >>> # Note that the learning_rate of linear_2 is 0.01.
            >>> linear_1 = paddle.nn.Linear(10, 10)
            >>> linear_2 = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear_1(inp)
            >>> out = linear_2(out)
            >>> loss = paddle.mean(out)
            >>> momentum = paddle.optimizer.Momentum(
            ...     learning_rate=0.1,
            ...     parameters=[{ # type: ignore
            ...         'params': linear_1.parameters()
            ...     }, {
            ...         'params': linear_2.parameters(),
            ...         'weight_decay': 0.001,
            ...         'learning_rate': 0.1
            ...     }],
            ...     weight_decay=0.01,
            ...     momentum=0.9
            ... )
            >>> out.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()

    velocityMbP??NF      ?learning_ratefloat | Tensor | LRSchedulerr   float
parameters<Sequence[Tensor] | Sequence[_MomentumParameterConfig] | Noner   boolweight_decay%float | WeightDecayRegularizer | None	grad_clipGradientClipBase | Nonemulti_precisionr   use_multi_tensorname
str | NonereturnNonec                   |t          d          |t          d          t          |t                    rt          |          }d }t          |t                    rdt          |d         t
                    rI|D ]F}d|v r|d         n|}|                     |          \  }}||d<   ||d<    ||          rd n|}||d<   G ||          rd n|}t                                          |||||
           d	| _	        || _
        t          |          | _        |                     |          \  | _        | _        || _        || _        i | _        |||| j        | j        d
| _        |	| _        | j        r|                                 | _        |                                 | _        |                                 | _        d | j        d<   |                                 | _        |                                 | _        d S d S )Nzlearning_rate is not setzmomentum is not setc                :    t          | t          t          f          S N)
isinstancer   r+   )regulars    r"   <lambda>z#Momentum.__init__.<locals>.<lambda>   s    Jw%8H$I$I r!   r   r/   r   r   )r)   r,   r/   r1   r5   r   )r   r   r   r   r   FP32_DenseTensor)
ValueErrorr<   intr+   listdict_update_regularizationsuper__init__type	_momentumr.   _use_nesterov_regularization_method_regularization_coeff_multi_precision_rescale_grad_master_weights_default_dict_use_multi_tensor_create_multi_tensor_dict_param_dict_velocity_dict_master_weight_dict_regularization_method_dict_regularization_coeff_dict)selfr)   r   r,   r   r/   r1   r3   r   r4   r5   	predicateparam_groupdecay
reg_method	reg_coeff
py_regular	__class__s                    r"   rF   zMomentum.__init__   sh     78882333lC(( 	/ ..LII	j$'' 	=*Q-.. =#- 
= 
=K *[88 $N33) 
 -1,G,G,N,N)J	;EK 78:CK 67)25)9)9!DuJ2<K//&Y|44FTT,
'!# 	 	
 	
 	
 	!!,// ''55	
'& /)! !((%)%@$($>
 
 "2! 	O#==??D"&"@"@"B"BD'+'E'E'G'GD$;?D$%78/3/M/M/O/OD,.2.L.L.N.ND+++	O 	Or!   c                    d}d}t          |t                    r	d}|j        }t          |t                    rd}|}||fS )N         l2_decay)r<   r   _coeffr+   )rW   r/   r[   r\   s       r"   rD   zMomentum._update_regularization   sR    
	lG,, 	,#J$+IlE** 	%#J$I9$$r!   c                   t          |t          j        t          j        j        f          sJ t          |t
                    r|                     |          }|D ]}|j        | j        v r| j	        rj| 
                    |j                  rP|                     |          }|                     | j        |           | j                            |j                   | 
                    |j                  r| j	        st!          j        d           |                     | j        |           | j                            |j                   dS )zD
        if framework.in_dynamic_mode():
            return
        zAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Momentum optimizer.N)r<   r   Blockpaddler   rC   _update_param_groupr5   _already_create_accumulatorrL   _is_dtype_fp16_or_bf16dtype_create_master_weight_add_accumulator_velocity_acc_straddwarningswarn)rW   blockr,   pmaster_ps        r"   _create_accumulatorszMomentum._create_accumulators   sW   
 %)/6:3C!DEEEEEj$'' 	>11*==J 	9 	9Av999$ )D)DQW)M)M 55a88%%d&<hGGG044QV<<<++AG44- \   !!$"8!<<<,008888#	9 	9r!   c                    t          |d          rt          |j        t                    r|S t	                                          |||          S )zpCreate and add backward regularization Operators

        Function helper of append_regularization_ops.
        regularizer)hasattrr<   rv   r   rE   _create_regularization_of_grad)rW   paramgradregularizationr^   s       r"   rx   z'Momentum._create_regularization_of_grad  sZ     5-(( 	Zw.
 .
 	 Kww554
 
 	
r!   c                &   t          |t          j        t          j        f          st	          d          t          |t
                    r|                     |          }|                     | j        |d                   }| 	                    |          }|d         }| j
        }| j        }t          |d          r4t          |j        t                    rd}|j        j        }n|j        d}d}| j        o|                     |d         j                  }|r| j        |d         j                 nd }	t+                      rit          |t
                    r|                     |d                    t/          j        |d         |d         |||	| j        | j        |||| j                  S | j        | j        |||| j        d	}
|d         g|d         g|g|gd
}|d         g|gd}|r
|	|d<   |	|d<   |                    | j        |||
d          }|S )Nzblock is not instance of Block.r   rv   rb   r`   ra   r/   r   )mur   r   r   r3   r   ParamGradVelocityLearningRateParamOutVelocityOutMasterParamMasterParamOutTrG   inputsoutputsattrsstop_gradient)r<   r   re   r   	TypeErrorrC   rg   _get_accumulator_masterrm   _create_param_lrrJ   rK   rw   rv   r   rc   rL   ri   rj   rN   r5   r   rD   r   	momentum_rH   rI   rM   	append_oprG   )rW   rq   param_and_gradvelocity_acclrry   r   r   find_mastermaster_weightr   r   r   momentum_ops                 r"   _append_optimize_opzMomentum._append_optimize_op  s   %)/39!=>> 	?=>>>nd++ 	F!55nEEN33"N1$5
 
 "">22 q! $ ;#95-(( 	+%+W55 +(2%','8'?$$".(*%'*$+ 
0K0K1#1
 1

 D !2!788 	 "## 3	.$// L++N>,JKKK#q!q!"%$"   n $ 2)>(<#. $ 2 E )+,'*+)N!#	 F ,A./ ,~ G
  :(5}%,9()  //Y" *  K r!   c                   |                      ||           |D ].}|                     | j        |          }| j        }| j        }t          |d          r4t          |j        t                    rd}|j        j	        }n|j        d}d}|j
        t          j        k    r| j        d         |                             |           | j        d         |                             |           | j        d         |                             |           | j        d         |                             |           |                     |j
                  r| j        d         |                             |           | j        d         |                             |           | j        r7| j        d         |                             | j        |j                            nd| j        d         |<   | j        d         |                             |           | j        d         |                             |           !t/          d          dS )	a  
        All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, bf16, float32).
        This function will be overridden in the corresponding optimizer file.

        Args:
            target_block: the block in which the loss tensor is present
            parameters: list of parameter tensors for the optimizer
        rv   rb   Nr`   ra   r?   FP16_DenseTensorz^Now multi_tensor_momentum only support fp32, fp16 or bf16 parameters and grad is DENSE_TENSOR.)rt   r   rm   rJ   rK   rw   r<   rv   r   rc   rj   rf   float32rR   appendrS   rU   rV   ri   rL   rT   rN   r5   r@   )rW   target_blockr,   param_group_idxry   r   r   r   s           r"   _multi_tensor_initzMomentum._multi_tensor_initl  s    	!!,
;;; 4	 4	E77& L %)$?!#'#= um,, /e/99 /,6)+0+<+C((&2,.)+.({fn,, !34_ELL   #$67HOO    01CD#&.////0BC#&-....,,U[99  !34_ELL   #$67HOO    ( ,-?@'fT1%*=>>>>  ,-?@' 01CD#&.////0BC#&-.... t  e4	 4	r!   c                   t          |t          j                  sJ g g d}g g d}t          |t                    rJ|D ]D}|d         |d         j        du r(|d         j        t          j        k    rw|d         j        t          j
        j        j        k    rR|d                             |d                    |                     |          }|d                             |           |                     |d         j                  rv|d         j        t          j
        j        j        k    rQ|d                             |d                    |                     |          }|d                             |           Fn|d         D ]}|d         |d         j        du rui }||d<   |                    d	 |                                D                        |                     |          }|d         j        t          j        k    rw|d         j        t          j
        j        j        k    rR|d                             |d                    |                     |          }|d                             |           |                     |d         j                  rv|d         j        t          j
        j        j        k    rQ|d                             |d                    |                     |          }|d                             |           ddg}	|	D ]`}
t'          | j        |
         |                   dk    r8| j        o|
dk    }| j        |
         }|||         nd}t/                      r'|                     d
          }|rGt          |t          j        j        t          j        j        f          r|                     d
d           t          |t          j        j        t          j        j        f          r|                     d
d           t=          j        | j        |
         |         ||
         | j         |
         |         ||
         || j!        | j"        | j#        |
         |         | j$        |
         |         || j%                  \  }}}| j        |
         |         ||
         | j         |
         |         ||
         d}| j        |
         |         | j         |
         |         d}| j!        | j"        | j#        |
         |         | j$        |
         |         d}|r1| j        |
         |         |d<   | j        |
         |         |d<   ||d<   |&                    d|||d           bdS )zM
        For Multi Tensor, append optimize merged_operator to block.
        )r?   r   r   Nr   Fr?   r   paramsc                &    i | ]\  }}|d k    ||S )r   r    ).0kvs      r"   
<dictcomp>z=Momentum._append_optimize_multi_tensor_op.<locals>.<dictcomp>  s/        $1 H}} q,}}r!   	found_infTr~   r   )r}   r   r   r   r   r   r3   merged_momentumr   )'r<   r   re   rB   r   rj   rf   r   rG   r
   VarDescVarTypeDENSE_TENSORr   r   ri   updateitemsrg   lenrR   rL   rT   r   _get_auxiliary_vareagerr   r   Value_set_auxiliary_varr   merged_momentum_rS   rH   rI   rU   rV   rM   r   )rW   r   parameters_and_gradsr   	grad_dictlr_dictr   r   param_grad_dictmulti_tensor_listkeyr   r   r   _r   r   r   s                     r"    _append_optimize_multi_tensor_opz)Momentum._append_optimize_multi_tensor_op  s1    ,	88888)+DD	')rBB*D11 3	?"6 ? ?!!$,!!$2e;;&q)/6>AA*1-2</<= = ""45<<^A=NOOO!22>BB 23::2>>>>33N14E4KLL?*1-2</<= = ""45<<^A=NOOO!22>BB 23::2>>>'?* #7x"@ ? ?!!$,!!$2e;;&(O0>OH-#** (<(B(B(D(D     &*%=%=o%N%NN&q)/6>AA*1-2</<= = ""45<<^A=NOOO!22>BB 23::2>>>>33N14E4KLL?*1-2</<= = ""45<<^A=NOOO!22>BB 23::2>>>/1CD$ O	 O	C4#C(9::Q>>)Gc5G.G  !% 8 = %0 "/22  *++ B $ 7 7 D DI  %%
(96:;K'L  G !33KFFF%%
(96:;K'L  H !33KGGG"("9 ,S1/B%cN /4_E#CL) N . <SA / !;C@ / ( .# #1aa& "&!1#!6!G )#$($7$<_$M(/	 F %)$4S$9/$J'+':3'?+( G #n(,(:151Q2)2+ 150O1)1+	 	E # ?040H0M+1}- 594L5)5+ 01 4?/0 **.% '#&* +   SO	 O	r!   c                   |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d| j        d                   | _        |                     d          }|S )Nr   r   r   r   r   r   )getrO   rH   rI   rM   rJ   rK   )rW   r,   s     r"   rg   zMomentum._update_param_groupA  s    #*:6
 
 (^^D.~>
 
 (^^D.~>
 
 '1nn#T%78O%P'
 '
# &0^^"D$67M$N&
 &
"  ^^H--
r!   )
r&   r'   NFNNFr(   FN)r)   r*   r   r+   r,   r-   r   r.   r/   r0   r1   r2   r3   r.   r   r+   r4   r.   r5   r6   r7   r8   r;   )r   r   r   __doc__rm   rF   rD   rt   rx   r   r   r   rg   __classcell__)r^   s   @r"   r$   r$   3   s	       ] ]~ # 7< ">B-1 %!!&FO FO FO FO FO FO FOP
% 
% 
%9 9 9:
 
 
 
 
 
U U Un> > >@S S Sj      r!   r$   )
__future__r   ro   typingr   rf   r   r   paddle.frameworkr   paddle.regularizerr   baser
   r   	optimizerr   collections.abcr   typing_extensionsr   r   paddle.nn.clipr   r   r   r   r   r   __all__r$   r    r!   r"   <module>r      s   # " " " " "                       3 3 3 3 3 3 & & & & & & " " " " " " " "             1((((((------//////999999++++++1 1 1 1 1#3 1 1 1 _ _ _ _ _y _ _ _ _ _r!   