
    |jf                   >   d dl mZ d dlZd dlZd dlmZ d dlmZ d dlZ	d dl
Z
d dlmZ d dl
mZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZmZmZmZmZmZ d d
lm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 er2d dl3m4Z4m5Z5 d dl6m7Z7m8Z8 d dl
m9Z9 d dl:m;Z; ddl)m<Z<m=Z=  G d de8          Z> e/e?ej@        d          ZAg ZB eCejD        E                    dd                     ZFe#jG        	 	 	 	 	 d d            ZH G d d          ZIdS )!    )annotationsN)defaultdict)TYPE_CHECKING)_C_ops)	parameterset_parameter)	ValueDict)core)Variable_current_expected_placedefault_main_programdevice_guardin_dygraph_modein_dynamic_or_pir_modein_pir_mode
name_scope)L2DecayWeightDecayRegularizer   )	frameworkunique_name)_get_no_grad_set_name_get_no_grad_set_valueappend_backward)	Parameter)LayerHelperLayerHelperBase)
get_logger   )LambdaDecayLRScheduler)CallableSequence)NotRequired	TypedDict)Tensor)GradientClipBase)OperatorProgramc                  .    e Zd ZU ded<   ded<   ded<   dS )_ParameterConfigzSequence[Tensor]paramsz2NotRequired[float | WeightDecayRegularizer | None]weight_decayz0NotRequired[float | Tensor | LRScheduler | None]learning_rateN)__name__
__module____qualname____annotations__     j/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/optimizer/optimizer.pyr+   r+   C   s6             HHHHGGGGGGr4   r+   z&%(asctime)s-%(levelname)s: %(message)s)fmt$FLAGS_shard_bypass_dygraph_optimizerc                *   ddl m}m} t                      }|j        dk    s
J d            |                                }	| D ]}
|
j        |	k    s
J d             ||	            ||	          }|&|                                                                }|	                    ||           \  }}|
                    ||          \  }}g }|D ]@}|<|	j                            |j                  }|dk    sJ |                    |           A|                    t!          |                     |                    |           t%          |          dk    r||fg}n4g }t'          |          D ]"\  }}|                    |||         f           #|S )Nr   )	Transform	orig2primr   zHThe append_backward_new interface is designed to process only one block.z@variable in loss_list should be in current block of main program)paddle.incubate.autograd.primxr9   r:   r   
num_blockscurrent_blockblockglobal_blockall_parameters	linearize	transposeopsindexopappend	erase_opssorted
erase_dotslen	enumerate)	loss_listparameter_listno_grad_set	callbackscheckpointsdistop_contextr9   r:   programr>   elad	param_dotloss_dotloss_bar	param_bar
op_indexesvarop_indexparams_and_gradsiparams                         r5   append_backward_newr_   U   s    DCCCCCCC"$$G"""R #"" !!##E 
 
x5   N !    Ie	5		B --//>>@@,,~yAAIx,,x;;Hi J ( (?ysv..Hq====h'''LL
##$$$MM)
>a+Y78!.11 	; 	;HAu##UIaL$9::::r4   c                     e Zd ZU dZded<   ded<   ded<    ej                    	 	 	 	 dedfd            Zd Zd Z	d Z
d Zd Z ej                    d             Zej        dgd            Zej        dhd            ZeZdid!Zd" Zej        djd%            Zej        dkd(            Zdld)Zdmd*Zd+ Zd, Zd- Zd. Zd/ Zd0 Zd1 Z	 	 	 	 	 dnd3Z d4 Z!d5 Z"d6 Z#d7 Z$	 dod9Z%	 dod:Z&	 	 	 	 dedpdEZ'dqdHZ(	 dodIZ)dmdJZ*	 dmdrdLZ+dmdMZ,ej-        dsdtdQ            Z.ej-        dsdudS            Z/ ej                    	 	 	 dvdwdU            Z0dV Z1 ej                    ej-        dxdW                        Z2dX Z3dY Z4ej        dZ             Z5ej        d[             Z6d\ Z7d] Z8d^ Z9d_ Z:e;d`             Z<e;da             Z=e;db             Z>e;dc             Z?e;dd             Z@dS )y	Optimizera  Optimizer Base class.

    Define the common interface of an optimizer.
    User should not use this class directly,
    but need to use one of it's implementation.

    Args:
        learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
            It can be a float value or any subclass of ``LRScheduler`` .
        parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in parameter groups \
            represents the scale of base learning_rate. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
            It can be a int or float value as coeff of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of \
            some derived class of ``GradientClipBase`` . There are three clipping strategies \
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str|None, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    Returns:
       Base class for optimizer.

    Examples:
        .. code-block:: python

            >>> # Take the subclass adam as an example
            >>> import paddle
            >>> linear = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> adam = paddle.optimizer.Adam(
            ...     learning_rate=0.1,
            ...     parameters=linear.parameters()
            ... )
            >>> loss.backward()
            >>> adam.step()
            >>> adam.clear_grad()

            >>> #Take the subclass sgd as an example
            >>> #optimize parameters in linear_1 and linear2 in different options.
            >>> #Note that the learning_rate of linear_2 is 0.01.
            >>> linear_1 = paddle.nn.Linear(10, 10)
            >>> linear_2 = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear_1(inp)
            >>> out = linear_2(out)
            >>> loss = paddle.mean(out)
            >>> sgd = paddle.optimizer.SGD(
            ...     learning_rate=0.1,
            ...     parameters=[{
            ...         'params': linear_1.parameters()
            ...     }, {
            ...         'params': linear_2.parameters(),
            ...         'weight_decay': 0.001,
            ...         'learning_rate': 0.1
            ...     }],
            ...     weight_decay=0.01)
            >>> loss.backward()
            >>> sgd.step()
            >>> sgd.clear_grad()

    WeightDecayRegularizer | NoneregularizationzLayerHelperBase | NonehelperzCallable[[bool], None]clear_gradientsNr.   float | LRScheduler
parameters4Sequence[Tensor] | Sequence[_ParameterConfig] | Noner-   %float | WeightDecayRegularizer | None	grad_clipGradientClipBase | Nonename
str | NonereturnNonec                `   |st          |t          j                  r t          dt	          |           d          t          |t
                    rt          d          t          |          | _        nd | _        || _        t          j
                    rs| j        t          d          |[t          | j        d         t
                    s;| j        D ]3}t          |d          r!|j        t          j        d| d            n4t          |t           t"          f          s t          d	t	          |           d
          |3t          |t          j        j        j                  st          d          t          |t                     rt+          |          | _        n>t          |t.                    r"t+          t!          |                    | _        n|| _        || _        || _        d | _        | j        rst          | j        d         t
                    r<| j        D ]}d|v s
J d            | j        d         d         d         j        | _        n| j        d         j        | _        i | _        t;          d           | _        d | _        g | _         i | _!        i | _"        | j#        | _$        | j        | j        d| _%        g | _&        | j        rRt          | j        d         t
                    r2| j        D ])}| '                    |(                                           *n| j        | _&        d | _)        | *                                | _+        i | _,        t[                      | _.        i | _/        | 0                                 d| _1        d| _2        d | _3        d| _4        d | _5        d S )Nzp`parameters` argument given to the optimizer should be an iterable of paddle Tensors, but got argument type is `z`.zv`parameters` argument should not get dict type, if parameter groups is needed, please set `parameters` as list of dictzNparameters argument given to the Optimizer should not be None in dygraph mode.r   regularizerz{If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. The weight_decay[U] in Optimizer will not take effect, and it will only be applied to other Parameters!z2learning rate should be float or LRScheduler, got z herezE'grad_clip' should be an instance of GradientClipBase's derived classr,   zYparams should be set in parameters if parameter groups are optimized in different optionsc                     i S Nr3   r3   r4   r5   <lambda>z$Optimizer.__init__.<locals>.<lambda>)  s     r4   )r-   rj   F)6
isinstancepaddler&   	TypeErrortypedictlist_parameter_list_namer   r   AttributeErrorhasattrrq   logginginfofloatr!   nnclipr'   r   rc   int
_grad_clip_learning_rate_dtypedtype_learning_rate_mapr   _accumulatorsrd   _opti_name_list_accumulators_holder_param_device_map
clear_gradre   _default_dict_param_groups_add_param_groupcopy_use_multi_tensor_create_multi_tensor_dict_param_dict_auxiliary_varsset_already_create_accumulator_master_weights_create_master_grad_states_use_fusion_storage_need_refusefusion_storage_fuse_buffer_versionmerged_model_params)selfr.   rg   r-   rj   rl   r^   param_groups           r5   __init__zOptimizer.__init__   s    ! *fm44 ePTU_P`P`e e e   *d++ '  
 $(
#3#3D  #'D 
$&& 	"#+$d   '!$"6q"94@@ 
"!%!5 	" 	"#E=99" % 1 =#L!X4@!X !X !X   "E-%)=>> 	_T-EXEX___    i)HII [   lE** 	/"),"7"7Dc** 	/")%*=*=">">D".D#+ 	<$.q1488 <#'#7  K#{222s 3222 #215h?BH"215; #%
 )44!$&!!## /
 

   	6Jt/CA/F$M$M 	6#3 : :%%k&6&6&8&89999: "&!5D "&99;;!+.55(!''))) $) !"$%!#'   r4   c                f    t                      rt                      | _        ni | _        d| _        d S NF)r   r	   _master_grads_master_gradr   s    r5   r   z$Optimizer._create_master_grad_statesN  s3    == 	$!*D!#D!r4   c                    || j         |<   d S rt   )r   )r   keyvals      r5   _set_auxiliary_varzOptimizer._set_auxiliary_varV  s    $'S!!!r4   c                    | j         t          | j                   nd}d t          |          D             d t          |          D             dS )Nr   c                    g | ]}g S r3   r3   .0_s     r5   
<listcomp>z7Optimizer._create_multi_tensor_dict.<locals>.<listcomp>\       6 6 6 6 6 6r4   c                    g | ]}g S r3   r3   r   s     r5   r   z7Optimizer._create_multi_tensor_dict.<locals>.<listcomp>]  r   r4   )FP32_DenseTensorFP16_DenseTensor)r   rJ   range)r   ns     r5   r   z#Optimizer._create_multi_tensor_dictY  s\    '+'9'EC"###1 6 6U1XX 6 6 6 6 6U1XX 6 6 6
 
 	
r4   c                8    | j                             |d           S rt   )r   get)r   r   s     r5   _get_auxiliary_varzOptimizer._get_auxiliary_var`  s    #''T222r4   c                <    || _         |                                  d S rt   )r   need_refuse)r   r   s     r5   set_merged_model_paramsz!Optimizer.set_merged_model_paramsc  s"    #6 r4   c                   ddl m} t          j                    sd S | j        j        dk    rd S | j        | j                                        D ]M\  }}|                                D ]3\  }}|	                    | j                  s| 
                                 4N| j                                        D ]3\  }}|	                    | j                  s| 
                                 4| j        sd S t                              d| j                     || j        | j        | j                  | _        | xj        dz  c_        |                                  t                              d| j                    d S )Nr   )FusionStorageAdamWz,refuse optimizer fuse buffer version start: z*refuse optimizer fuse buffer version end: )fusion_utilsr   r   r   	__class__r/   fused_states_bufferr   items_is_shared_buffer_withr   r   r   local_loggerwarningr   r   r   reset_need_refuse)r   r   r   vvvs        r5   _maybe_refusezOptimizer._maybe_refuseg  s   ////// (** 	F >"g--F #/*0022 + +1WWYY + +EAr44T5MNN +((***+ ,2244 ' '1//0HII '$$&&&  	FV4;TVV	
 	
 	
 ,m $
 

 	!!Q&!!   T9RTT	
 	
 	
 	
 	
r4   dict[str, Tensor]c                   i }t          | j                  dk    r=t          | j                  dk    r%| j                                        D ]
\  }}|||<   n| j                                        D ]\  }}|                                D ]o\  }}|||j        <   t          j                    rMt          j        dd          }|dk    r1|	                                
                                ||j        dz   <   pt          | d          r"t          | j                  dk    r
| j        |d<   t          | j        t                    r| j                                        |d	<   |S )
a  
        Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be include in state dict.
        If the optimizer never be called(minimize function), the state_dict is empty.


        Returns:
            dict[str,Tensor], dict contains all the Tensor used by optimizer

        Examples:
            .. code-block:: python

                >>> import paddle
                >>> emb = paddle.nn.Embedding(10, 10)

                >>> adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
                >>> state_dict = adam.state_dict()

        r   xpu_adamw_moment_dtypefp32defaultfp16.SCALE_VALUEr   master_weightsLR_Scheduler)rJ   r   r   r   rl   r
   is_compiled_with_xpuosgetenv
get_tensorget_xpu_scale_valuer   r   rv   r   r!   
state_dict)	r   r   rl   rZ   kr   	para_namevar_tmpr   s	            r5   r   zOptimizer.state_dict  s   ( 
t!""a''C0I,J,JQ,N,N!6<<>> ' '	c#&
4  ' *0022  1*+'')) 
 
&Iw/6Jw|,022 134f2 2 2. 2V;; ' 2 2 4 4 H H J J 'w|n'DE
 4*++ 	D4'((A--/3/C
+,d);77 	J)-)<)G)G)I)IJ~&r4   r   c                   t          | j        t                    rX|                    dd          }t          | j        t                    s|
J d            |r| j                            |           |                                }d|v r|                    d           d|v r2t          | d          r|d         | _	        |                    d           || _
        | j                                        D ]\  }}|                                D ]\  }}|j        |v sJ d|j         d            |                                }|                                }t!          j                    rMt%          j        dd	
          }	|	dk    r1|                    |                    |j        dz   d                     |                    ||j                            ʌdS )a9  
        Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be changed.

        Args:
            state_dict(dict): Dict contains all the Tensor needed by optimizer

        Return:
            None

        Examples:
            .. code-block:: python

                >>> import paddle

                >>> emb = paddle.nn.Embedding(10, 10)

                >>> layer_state_dict = emb.state_dict()
                >>> paddle.save(layer_state_dict, "emb.pdparams")

                >>> scheduler = paddle.optimizer.lr.NoamDecay(
                ...     d_model=100, warmup_steps=100, verbose=True)
                >>> adam = paddle.optimizer.Adam(
                ...     learning_rate=scheduler,
                ...     parameters=emb.parameters())
                >>> opt_state_dict = adam.state_dict()
                >>> paddle.save(opt_state_dict, "adam.pdopt")

                >>> opti_state_dict = paddle.load("adam.pdopt")
                >>> adam.set_state_dict(opti_state_dict)

        r   NzHLR_Scheduler state must be included in the state dict except LambdaDecayr   r   zoptimizer Tensor z
 not foundr   r   r   r   r         )rv   r   r!   r   r    set_state_dictr   popr   r   r   r   r   rl   valuer   r
   r   r   r   set_xpu_scale_value	set_value)
r   r   lr_state_dictr   r   r   r   rZ   tensorr   s
             r5   r   zOptimizer.set_state_dict  s   B d);77 	B&NN>4@@Md1;?? $00^ 100  B#22=AAA  __&&
Z''NN>***z))t.// D'12B'C$NN+,,,$.!&,,.. 	8 	8DAq&'ggii 8 8"	7|z111@@@@ 211 mmoo)),.. -/Y0&. . .* .7722&NN7<.+H$OO   j67777!8	8 	8r4   	list[str]c                    | j         S rt   )r   r   s    r5   get_opti_var_name_listz Optimizer.get_opti_var_name_list  s    ##r4   c                      fd}t           j        j                                        5   |             d d d            d S # 1 swxY w Y   d S )Nc            
     ^   j         t          j                    nj         } t          j                    dk    r| t          j        k    s't          j                    dk    r| t          j        k    rt          j        n| } t          j        t                    r	                                }t                      rt          j                                        }t          j                                        }t          j        d          }t!                                                    }t          j                            |          5  t          j        j                            |          }t          j        j                            g |           } |||                                          }d|_        t5          ||           d d d            n# 1 swxY w Y   |                    |           t          |t          j        j                  s|j        _        t          j                            |          5  t=          || g           }	d d d            n# 1 swxY w Y   d|	_        d|	_        j        |_         |	|_!        ||_"        |	j#        |<   d S d S t          |tH          j%                  st          j        d          }|j        _        j&        '                    |g dd|           }tI          j                    }
j        |
_         ||
_!        |j#        tI          j                    <   t!                                                    }j&        (                    |t          j        j                            |                     d S t          j        t                     r	                                }t                      rRt          |t          j        j                  rd S tS                      }t          | t          j*        j        j+                  sjt          | t          j*        j,        j-        j.                  rt          j        j        j/        |          } n$t          j        j        0                    |           } t          j        j        1                    | g t          j        d          t          j        j        2                    t!          j                                      j#        t          j                                        <   d S t          |tH          j%                  rd S t          j        3                    t          j        d          g t!          j                  | d	          j#        tI          j                    <   d S d S )
Nfloat16bfloat16r.   r   T)rl   shapepersistablestop_gradientr   initializer)r   r   rl   r   rl   r   r   r   r   )4r   rw   get_default_dtyper   r   float32rv   r   r!   _global_learning_rater   staticdefault_startup_programr   r   generater   program_guardr   r   Constantpirr
   ParameterMetar?   r   r   set_parameters_fromValue	_var_namer   r   lr_schedulerlr_varlr_namer   r   r   rd   create_global_variableset_variable_initializerr   baseDataType	libpaddleVarDescVarTypevartype_to_datatypeconvert_np_dtype_to_dtype_create_persistable_valueConstantInitializercreate_global_var)	_lr_dtyper  startup_programmain_programr  lr_valuer   parameter_metainit_resultr^   	main_proglrplacer   s                r5   	do_createz9Optimizer._create_global_learning_rate.<locals>.do_create	  s    ;& (***[  022i??%77 022j@@%88    $-{;; h3355== 9&,m&K&K&M&MO#)=#E#E#G#GL)2?CCG$T%8%8%:%:;;H44_EE < <&,i&;&D&D"* 'E ' ' *0)F)F	* * '2k*O,H,H,J,J' ' 37/%k7;;;< < < < < < < < < < < < < < < !44_EEE%ffj.>?? 	F8?+5#]88FF F F$-gy"$E$EEF F F F F F F F F F F F F F F.2+,0)484G1.3+/6,@E/===	F 	F &fi.@AA #"-"6"G"G8?+5!%!C!C!("$(,*."+ "D " " %.$B$D$D	151D	.+1	( # /%:<<  %T%8%8%:%:;;HK88$*I$9$B$B"* %C % % 9      D/77 +//11== (!"fj&677  7 9 9))V[5E5NOO ") )6;+@+H+P    " -3JO,O$--"		
 %+JO$N$N(1%& %& !* #JODD"+"$!,!5o!F!F(.	(=(Q(Q&+D,?&@&@ )R ) )	 E   /"M>>@@   ""i&899  #M;;!,!5o!F!F"$"'(;"<"<"+(, <   /%:<<  G+ +s%   B GG GI!!I%(I%)rw   r	  r   dygraph_guard_if_declarative)r   r  s   ` r5   _create_global_learning_ratez&Optimizer._create_global_learning_rate  s    }	 }	 }	 }	 }	~ ["??AA 	 	IKKK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA	Ar   r   c           	        t          |t          t          f          s t          dt	          |           d          t          | j        t                    rt          d          t          |          | _        |                                 }|t                      rLt                      }t          j        |t          |j                  t          |          |j        |           dS t!          j                                                    }|                    dd|gi|j        t          |j                  t          |          dd	           dS dS )
a  
        :api_attr: imperative

        Set the value of the learning rate manually in the optimizer. If the optimizer use LRScheduler,
        this API cannot be invoked, because it will lead to conflict.

        Args:
            value (float): the value of learning rate.

        Returns:
            None

        Examples:
            .. code-block:: python

                >>> import paddle
                >>> linear = paddle.nn.Linear(10, 10)

                >>> adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

                >>> # set learning rate manually by python float value
                >>> lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
                >>> for i in range(5):
                ...     adam.set_lr(lr_list[i])
                ...     lr = adam.get_lr()
                ...     print("current lr is {}".format(lr))
                current lr is 0.2
                current lr is 0.3
                current lr is 0.4
                current lr is 0.5
                current lr is 0.6

        zDThe type of 'value' in optimizer.set_lr must be float, but received .zhoptimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict.Nfill_constantOut)r   r   r   T)ry   outputsattrsr   )rv   r   r   rx   ry   r   r!   RuntimeErrorr   r   r   r   full_r{   r   r   r   r   r?   	append_op)r   r   
current_lrr  r?   s        r5   set_lrzOptimizer.set_lr  sv   F %#u.. 	eW[\aWbWbeee   d);77 	z   $Ell//11
!   /11)**%LL$      )=??LLNN&&("ZL1!+!1!%j&6!7!7!&u 
 #' ' 	 	 	 	 	 "!r4   	schedulerr!   c                    ddl m} t          ||          s t          dt	          |           d          || _        dS )a  
        :api_attr: imperative

        Set the LRScheduler of the learning rate manually in the optimizer. If the optimizer already used LRScheduler previously,
        this API will set it be the new one.

        Args:
            scheduler (LRScheduler): the LRScheduler of learning rate

        Returns:
            None

        Examples:
            .. code-block:: python

                >>> import paddle
                >>> linear = paddle.nn.Linear(10, 10)

                >>> adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

                >>> # set learning rate manually by class LRScheduler
                >>> scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2,4,6], gamma=0.8)
                >>> adam.set_lr_scheduler(scheduler)
                >>> lr = adam.get_lr()
                >>> print("current lr is {}".format(lr))
                current lr is 0.5

                >>> # set learning rate manually by another LRScheduler
                >>> scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=5, gamma=0.6)
                >>> adam.set_lr_scheduler(scheduler)
                >>> lr = adam.get_lr()
                >>> print("current lr is {}".format(lr))
                current lr is 0.1

        r   )r!   zXThe type of 'scheduler' in optimizer.set_lr_scheduler must be LRScheduler, but received r   N)paddle.optimizer.lrr!   rv   rx   ry   r   )r   r*  r!   s      r5   set_lr_schedulerzOptimizer.set_lr_scheduler  sc    J 	433333)[11 	}kopykzkz}}}   (r4   c                l    t          | j        t                    r| j        S |                                 S )a  
        Get current learning rate of optimizer.
        If 'LRScheduler' is not used, the return value is all the same.
        If 'LRScheduler' is used, the return value is the current scheduled learning rete.

        Returns:
            float, The current learning rate of optimizer.

        Examples:
            .. code-block:: python

                >>> # train on default dynamic graph mode
                >>> import paddle
                >>> import numpy as np
                >>> emb = paddle.nn.Embedding(10, 3)

                >>> ## example1: LRScheduler is not used, return the same value is all the same
                >>> adam = paddle.optimizer.Adam(0.01, parameters = emb.parameters())
                >>> for batch in range(10):
                ...     input = paddle.randint(low=0, high=5, shape=[5])
                ...     out = emb(input)
                ...     out.backward()
                ...     print("Learning rate of step{}: {}".format(batch, adam.get_lr())) # 0.01
                ...     adam.step()
                Learning rate of step0: 0.01
                Learning rate of step1: 0.01
                Learning rate of step2: 0.01
                Learning rate of step3: 0.01
                Learning rate of step4: 0.01
                Learning rate of step5: 0.01
                Learning rate of step6: 0.01
                Learning rate of step7: 0.01
                Learning rate of step8: 0.01
                Learning rate of step9: 0.01

                >>> ## example2: StepDecay is used, return the scheduled learning rate
                >>> scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=2, gamma=0.1)
                >>> adam = paddle.optimizer.Adam(scheduler, parameters = emb.parameters())
                >>> for batch in range(10):
                ...     input = paddle.randint(low=0, high=5, shape=[5])
                ...     out = emb(input)
                ...     out.backward()
                ...     print("Learning rate of step{}: {}".format(batch, adam.get_lr())) # 0.5->0.05...
                ...     adam.step()
                ...     scheduler.step()
                Learning rate of step0: 0.5
                Learning rate of step1: 0.5
                Learning rate of step2: 0.05
                Learning rate of step3: 0.05
                Learning rate of step4: 0.005000000000000001
                Learning rate of step5: 0.005000000000000001
                Learning rate of step6: 0.0005000000000000001
                Learning rate of step7: 0.0005000000000000001
                Learning rate of step8: 5.000000000000001e-05
                Learning rate of step9: 5.000000000000001e-05

                >>> # train on static graph mode
                >>> paddle.enable_static()
                >>> main_prog = paddle.static.Program()
                >>> start_prog = paddle.static.Program()
                >>> with paddle.static.program_guard(main_prog, start_prog):
                ...     x = paddle.static.data(name='x', shape=[None, 10])
                ...     z = paddle.static.nn.fc(x, 100)
                ...     loss = paddle.mean(z)
                ...     scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=2, gamma=0.1)
                ...     adam = paddle.optimizer.Adam(learning_rate=scheduler)
                ...     adam.minimize(loss)

                >>> exe = paddle.static.Executor()
                >>> exe.run(start_prog)
                >>> for batch in range(10):
                ...     print("Learning rate of step{}: {}".format(batch, adam.get_lr())) # 0.5->0.05->0.005...
                ...     out = exe.run(main_prog, feed={'x': np.random.randn(3, 10).astype('float32')})
                ...     scheduler.step()
                Learning rate of step0: 0.5
                Learning rate of step1: 0.5
                Learning rate of step2: 0.05
                Learning rate of step3: 0.05
                Learning rate of step4: 0.005000000000000001
                Learning rate of step5: 0.005000000000000001
                Learning rate of step6: 0.0005000000000000001
                Learning rate of step7: 0.0005000000000000001
                Learning rate of step8: 5.000000000000001e-05
                Learning rate of step9: 5.000000000000001e-05
        )rv   r   r   r   s    r5   get_lrzOptimizer.get_lr  s5    l d)511 	)&&&&(((r4   c                    |@t                      rt          j                    }nt          j                                        }| j                            |d          S )zC
        get global decayed learning rate
        :return:
        N)r   r   r   rw   r   r   r   )r   rR   s     r5   r   zOptimizer._global_learning_rateW  sS    
 ?   ?#8:: -<<>>&**7D999r4   c                     t          d          )zFappend optimize operator to block and return all the added optimize_opzcClass "Optimizer" cannot be used directly as an optimizer, please use its subclasses such as "Adam")NotImplementedError)r   r>   param_and_grads      r5   _append_optimize_opzOptimizer._append_optimize_opc  s    !q
 
 	
r4   c                P   |d         }t          |d          r|j        d|j        v r|j        d         }t          |t          t          j        j        f          r|S |dk    r|                                 S t          j        	                                
                    d          5  t          j        d          5  |                                 |z  cd d d            cd d d            S # 1 swxY w Y   d d d            d S # 1 swxY w Y   d S |                                 S )Nr   optimize_attrr.         ?T)is_with_optscale_with_param_lr)r   r6  rv   r   rw   r   r  r   r   r   _lr_schedule_guardr   r   )r   r3  r^   param_lrs       r5   _create_param_lrzOptimizer._create_param_lri  s   q!E?++	0#/5#666*?;H(Xvz/?$@AA Gs??55777 ::<<OO(, P  G G ",-BCC	G G  $99;;hFG G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G --///s6   *D?C/D/C3	3D6C3	7DDDc                X   |j         | j        v r| j        |j                  }n|                     |          }t                      rt          j                                        }t          j                                        }t          j                            |          5  d } |||j                   }t	          j	        |d          }d|_
        t          j                            ||           d d d            n# 1 swxY w Y   t          j                            |          5  t          j                                         t          j                            ||j        |j        t%          j                              }|                                r|                    |                                           t          j        j        j                            |                                j        g |                                g          }	|	|                                _        d|_
        d d d            n# 1 swxY w Y   nt;          j                    rt	          j	        |d          }||_         nt?          | j         tB                    sJ t          j        "                    ||j        ddd          }| j         j#        $                                }
|
%                    dd|gid|gi|j        t$          j&        j'        j(        d	
           || j        |j         <   |S )Nc                    |                                  j        D ]a}|                                dk    rG||                                d         k    r)|                    d                                          c S bd S )Nzbuiltin.set_parameterparameter_namer   )r?   rC   rl   r$  operandsource)startuprl   rE   s      r5   get_param_from_startupz?Optimizer._create_master_weight.<locals>.get_param_from_startup  su    ")"6"6"8"8"< > >B "		-D D D$(BHHJJ7G,H$H$H')zz!}}';';'='= = = =#tr4   r   Tr   r   castXr"  )in_dtype	out_dtype)ry   inputsr#  r$  ))rl   r   _gen_master_weight_var_namer   rw   r   r   r   r   rD  r   _pir_opsset_persistable_valuer   reset_insertion_point_to_startdatar   r   r
   Placeis_distset_typery   r	  r  create_op_dist_attribute	dist_attrprocess_meshget_defining_opr   r   rv   rd   r   r  r  r?   r'  r  r  FP32)r   r^   rZ   var_namer  r  rC  startup_paramstartup_varop_dist_attrr>   s              r5   _create_master_weightzOptimizer._create_master_weight  sv   :---&uz2CC77>>H}} <"(-"G"G"I"I%}AACC]00AA Q Q$ $ $ %;$:'% %M #)+mY"G"GK.2K+O99+xPPP!Q Q Q Q Q Q Q Q Q Q Q Q Q Q Q" ]00>> + +J==??? -,, #)#)
	 C #**,, 	G[%5%5%7%7888"K15NN + 5 5 7 7 D "!,!6!6!8!8 9  % ;G++--7&*CO%+ + + + + + + + + + + + + + +& *,, k%33#!$+{;;;;;m55!+# $ 6   3@@BB%>"SEN$)K%)\%9%> 	      03D ,
s&   "AC??DD)DIIIc                >    |j         dz   }t          j        |          S )N_fp32_master)rl   r   r   )r   r^   rV  s      r5   rI  z%Optimizer._gen_master_weight_var_name  s    :.#H---r4   c           	        |                      |j                  sJ t                      r_|| j        v r| j        |         }nt	          j        |d          }|                                                    dd           || j        |<   n|j        | j        v r| j        |j                 }nb|j        dz   }t          j
        |          }|j                            ||j        dd|j        |j        |j                  }|| j        |j        <   |S )Nr   master_grad_castTr\  r   )rl   r   r   r   	lod_levelr   is_data)_is_dtype_fp16_or_bf16r   r   r   rw   rD  rT  set_bool_attrrl   r   r   r>   
create_varr   r_  r   r`  )r   gradrZ   rV  s       r5   _create_master_gradzOptimizer._create_master_grad  s   **4:66666== 	4t)))(.k$	22##%%334FMMM+."4((yD...(39~5&/99j++!*#"n $ 0 L ,   14"49-
r4   c                    dS )zCreate all accumulators needed by the parameters

        Args:
            block: the block in which the loss tensor is present
            parameters: list of parameter tensors for the optimizer
        Nr3   )r   r>   rg   s      r5   _create_accumulatorszOptimizer._create_accumulators  	     	r4   c                    dS )a  Finish any custom updates needed
           before completing an optimization step

        Args:
            block: the block in which the loss tensor is present
            parameters: list of parameter tensors for the optimizer

        Returns:
            None
        Nr3   )r   r>   parameters_and_gradss      r5   _finish_updatezOptimizer._finish_update  s	     	r4           c           
     j   | j         | j         dz   |z   }|| j        v rY|j        | j        |         v rEt          j                    r| j        |         |j                 S t          d| d|j                   |                                  ||j        }|j        dz   |z   }t          j	        |          }| j
                            |           ||                     |j                  }t                      rd|vrtt          j        j                            |p|j        ||t          j        j                            t-          |                    |                                          }	nt          j        j                            |p|j        ||t          j        j                            t-          |                              }	nL| j        t3          | j        j                  | _        t9          | j        t2                    sJ | j                            |d	|p|j        t           j        j        j         |d	
          }	t	                      rn|dk    st9          |t           j!                  rNtE          j#        |	|	j        tI          t-          |                    |	j        t!          j!                               nstK          |          5  | j        &                    |	t          j        j                            t-          |                               ddd           n# 1 swxY w Y   t          j                    rtO          | j(                  dk    r|| j(        v sJ d| d            |	)                    | j(        *                    |                     t!          j+                    r_tY          j-        dd          }
|
dk    rC|	.                                /                    | j(        0                    |dz   d                     |	| j        |         |j        <   |	S )a|  Utility function to add an accumulator for a parameter

        Args:
            block: the block in which the loss tensor is present
            name: name of the accumulator
            param: parameter tensor for which accumulator is to be added
            dtype: data type of the accumulator tensor
            fill_value: value to initialize the accumulator tensor
        Nr   Accumulator z already exists for parameter betar   )r   rR  r   T)rl   r   r   ry   r   belong_to_optimizercpur   zOptimizer set error, z should in state dictr   r   r   r   r   r   )1r}   r   rl   r   r   	Exceptionr   r   r   r   r   rF   _get_device_for_paramr   rw   r   r
   r  r   r   r   r   r   rR  rd   r   r   r/   rv   r  r  r  DENSE_TENSORCPUPlacer   r&  strr   r  rJ   r   r   r   r   r   r   r   r   r   )r   rl   r^   r   
fill_valuer   ry   devicerV  rZ   r   s              r5   _add_accumulatorzOptimizer._add_accumulator  sx   & :!:#d*DD&&&
d0666(** <)$/
;;OtOO5:OO  
 =KE:#d*'11##H--->//
;;F== E	X%%jo>>(U[ &	 5 > >#J// !? ! ! $oo// ?   jo>>(U[ &	 5 > >#J// !? ! !	 ?   {")$.*ABBdk;77777+44 *u{\)6$( 5  C    %:fdm#D#DIj))**IMOO    "&))  K88$*I$9$B$B"'
"3"3 %C % % 9                  (** t011A55#t'@@@@OOOO A@@ MM$";"?"?"I"IJJJ 022 	134f2 2 2. 2V;;NN,,@@ $ 9 = =$,~$=t!" !"   034 ,
s   AL22L69L6c                    | j         | j         dz   |z   }|| j        vs|j        | j        |         vrt          d| d|j                   | j        |         |j                 S )a  Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter tensor for which accumulator is to be fetched

        Returns:
            accumulator tensor for the parameter
        Nr   rn   does not exist for parameter )r}   r   rl   rr  )r   rl   r^   s      r5   _get_accumulatorzOptimizer._get_accumulatorq  s~     :!:#d*D***z!3D!999OtOO5:OO   !$'
33r4   c                &   | j         | j         dz   |z   }| j        o|                     |j                  }|r| j        |j                 n|}|j        }|| j        vs|| j        |         vrt          d| d|           | j        |         |         S )a
  Utility function to fetch an accumulator for a parameter
        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched
        Returns:
            accumulator variable for the parameter
        Nr   rn  r{  )r}   _multi_precisionra  r   r   rl   r   rr  )r   rl   r^   find_mastertarget_paramtarget_names         r5   _get_accumulator_masterz!Optimizer._get_accumulator_master  s     :!:#d*D+ 
0K0KK1
 1
 1<FD ,, 	 #'***$"4T":::PtPP;PP   !$'44r4   c                    |D ]r}|d         j         du ra|d         j        }|j        }t          j                                        }|D ],}|j        }||v r|                    |          | j        |<    n-sd S )Nr   F)	r   rl   rC   r
   op_proto_and_checker_makerkOpDeviceAttrNameinput_arg_namesattrr   )	r   rj  target_blockr3  
param_namerC   device_attr_namerE   r  s	            r5   _update_param_device_mapz"Optimizer._update_param_device_map  s    2 	 	Na .%77+A.3
"&3EEGG !   B&(&8O!_44=?WW,> >.z: 	 5	 	r4   c                6    d }|| j         v r| j         |         }|S rt   )r   )r   r  rx  s      r5   rs  zOptimizer._get_device_for_param  s'    ///+J7Fr4   r   c           	     j   t          j                                                    }|}t          j                                                    }|j        |j        k    r8|j        dk    s
J d            t          j                    j        |j                 }t          |j                  }t          | j
        j                  | _        |                                  | j        r,| j
        j        dv rt          | j        d         |                   dk    rt          | j        d         |                   dk    r{t!          |t"                    r*|dk    sJ |                     |d |D             |           n<|                     |           |                     |d |d	         D             |           t          j                    r|                     |||
           no|                     ||           g }|D ]M}|d         j        s>|d         6|                    |d                    |                    |d                    N|d         j        j                            |          5  t9          d          5  |                     |d         j                  }	t?          |	          5  |                     |||
           ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   nBt          j                    s5t!          |t@                    r|d	         n|}
|                     |
|           t!          |t"                    rtB          j"        j         #                                5  d}|D ]\  }}tI          |d          rd} n|r,tB          j%        j&        j'        (                    || |           n | )                    |d |D                        ddd           n# 1 swxY w Y   nz|*                                }d |d	         D             |d	<   tB          j"        j         #                                5  | )                    ||           ddd           n# 1 swxY w Y   t          j                    r| +                    d          }dtB          j,        -                                v r*|(|.                                r|/                                }|r7t!          |t`          j1        j2                  r| 3                    dd           nt!          |t`          j1        j2                  r| 3                    dd           t!          |t"                    rc| 4                                 |D ]J}|d         |d         5                                s%|d         j        du r| 6                    ||           Knj|d	         D ]}|d         |d         5                                s%|d         j        du rNi }||d	<   |7                    d |8                                D                        | 6                    ||           n|D ]}|d         |d         j        j                            |          5  t9          d          5  |d         j        du r]|                     |d         j                  }	t?          |	          5  | 6                    ||          }ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   | 9                    ||           tB          j"        j0        :                    d           t          |j                  }|;                    ||          S )  Add optimization operators to update gradients to tensors.

        Args:
          parameters_and_grads(list(tuple(Tensor, Tensor))):
            a list of (tensor, gradient) pair to update.

        Returns:
          return_op_list: a list of operators that will complete one step of
            optimization. This will include parameter update ops, global step
            update ops and any other custom ops required by subclasses to manage
            their internal state.
        zFcurrent block is not global_block, but it doesn't have backward block.)MomentumAdamr   r   r   c                8    g | ]}|d          j         |d          S r   r   r   ps     r5   r   z7Optimizer._create_optimization_pass.<locals>.<listcomp>  9        !#$Q4#5aD  r4   c                8    g | ]}|d          j         |d          S r  r  r  s     r5   r   z7Optimizer._create_optimization_pass.<locals>.<listcomp>  r  r4   r,   param_group_idxr   N	optimizerF_need_shard_autoTc                8    g | ]}|d          j         |d          S r  r  r  s     r5   r   z7Optimizer._create_optimization_pass.<locals>.<listcomp>:  s9       $%'(t'9 !!  r4   c                8    g | ]}|d          j         |d          S r  r  r  s     r5   r   z7Optimizer._create_optimization_pass.<locals>.<listcomp>B  s9     - - -Q4--aD- - -r4   	found_infxpuc                &    i | ]\  }}|d k    ||S r,   r3   r   r   r   s      r5   
<dictcomp>z7Optimizer._create_optimization_pass.<locals>.<dictcomp>r  s/     %& %& %&,0Aq+,== )*1+8==r4   )<r   r   r?   r=   idxbackward_block_idxblocksrJ   rC   r   r   r/   rd   r  r   r   rv   r{   _multi_tensor_init_update_param_groupr    _append_optimize_multi_tensor_opr  r   rF   r>   rR   _optimized_guardr   rs  rl   r   rz   rw   r	  r  r   distributedauto_parallelfully_shardshard_accumulatorsrg  r   r   rx  
get_devicerO  _local_valuer
   eagerr&   r   r   _is_initializedr4  updater   rk  _set_warmup
_slice_ops)r   rj  r  r?   r  r=   startparam_grad_listr3  rx  params_grads_device_map_need_shardr^   r   params_acc_dictr  param_grad_dictoptimize_opends                      r5   _create_optimization_passz#Optimizer._create_optimization_pass  s
   4 !577DDFF#!688FFHH 000 3r999X :99 %9;;B0L L$%%!$."9::))+++ ! k	"dn&= B
 '
 '

 D$%78IJJaOO();<_MNN  2D99 *a////++$ %9  
 (    ,,-ABBB++$ %9(%C  
 (   (**  55 ($3 6     --(,  
 #%&: B BN*1-;B*1-9'..~a/@AAA'..~a/@AAA#A&,4EE'   {++	  "778J8OPPF%f--  ==(0,; >                                               ,..  ""6==.(22- (
 --+\   .55 M[*GGII  "'K$8 " "q"5*<== "*.K!E" # *8DWW0$    11( )=                  ( #7";";"="=- -,X6- - -)
 [*GGII M M--lOLLLM M M M M M M M M M M M M M M (** B" 33K@@	V]557777!-!))++ . !* 6 6 8 8I ("!)TZ->?? C//TBBB!)TZ->?? D//UCCC!"6== ""**,,,.B " "N
 !/q 1 9'5a'8'H'H'J'J !: !)-a0>%GG $ 8 8$0.!" !" !"" /C8.L " "N .q 1 9'5a'8'H'H'J'J !: !)-a0>%GG24<J 9 / 6 6%& %&4H4N4N4P4P%& %& %&!" !" !" !% 8 8$0/!" !" !"!"( '; " "N%a(0 &q)/7HH* " " #;//	" " *!,:eCC%)%?%? .q 1 6& &F ".f!5!5 " ".2.F.F$0./" /"" " " " " " " " " " " " " " "" " " " " " " " " " " " " " " " " " " " " " " " " " " " " "" 	L*>???$$U+++,"##&&uc222s   /L?0K+/KK+KK+KK+L+K/	/L2K/	3LL	LA-PPP RR
R%]5?\.4\	\.\\.\\."].\22]5\26]]		]		c                   t           j                                                                        }|}|j        d         }|                                  t          |t                    r!|                     |d |D                        n?|	                                }d |d         D             |d<   |                     ||           t          |t                    r4|D ]0}|d         |d         j
        du r|                     ||           1nq|d         D ]h}|d         |d         j
        du rNi }||d<   |                    d	 |                                D                        |                     ||           i|                     ||           t           j        j                            d           |j                            |          dz   }	|j        |	d         S )
r  r  c                8    g | ]}|d          j         |d          S r  r  r  s     r5   r   z;Optimizer._pir_create_optimization_pass.<locals>.<listcomp>  s(    NNN!1Q4;MN1NNNr4   c                8    g | ]}|d          j         |d          S r  r  r  s     r5   r   z;Optimizer._pir_create_optimization_pass.<locals>.<listcomp>  s9     ) ) )t))!) ) )r4   r,   r   Nr   Fc                &    i | ]\  }}|d k    ||S r  r3   r  s      r5   r  z;Optimizer._pir_create_optimization_pass.<locals>.<dictcomp>  s/        $1 H}} q,}}r4   )rw   r   r   r?   rC   r  rv   r{   rg  r   r   r4  r  r   rk  r	  r
   r  rD   )
r   rj  r  r?   r  last_opr  r3  r  start_indexs
             r5   _pir_create_optimization_passz'Optimizer._pir_create_optimization_pass  sE     }99;;HHJJ#"2&))+++ *D11 	E%%NN3NNN   
 37799O) )(2) ) )OH%
 %%lODDD*D11 	L"6 K K!!$,!!$2e;;,,\>JJJ	K #7x"@ L L!!$,!!$2e;;&(O0>OH-#** (<(B(B(D(D     ,,\?KKK 	L*>???$$U+++"&,,W559--r4   lossr&   r  Program | Nonelist[Tensor] | list[str] | NonerN   set[Tensor] | set[str] | NonerO    list[Callable[..., None]] | Nonelist[tuple[Tensor, Tensor]]c                z   d}t          j                    rn|                     ||          }| j        |j        | _        t          j                    rb|r|n| j        }g }t          j                            |          }	t          |	          D ]$\  }
}||
                    ||
         |f           %n|t          j        j        j        g}nt          |t                     sJ |j        j        }t'          j        |j                  dk    sJ d|j         d            |r|n| j        }t          j                            ||          5  t1                      r|2|                                                                }d |D             }g }t          j        j                            |||          }	t          |	          D ]$\  }
}||
                    ||
         |f           %n6ddlm}  |            rtA          |g|||          }ntC          ||||          }ddd           n# 1 swxY w Y   |S )	a+  
        The first part of ``minimize``, do auto-diff to append backward operations for
        the current program.

        Args:
            loss (Tensor): ``loss`` tensor to run optimizations.
            startup_program (Program|None, optional): :ref:`api_paddle_static_Program` for
                initializing parameters in ``parameters``. The default value
                is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
            parameters (list[Tensor]|list[str]|None, optional): List of ``Tensor`` or ``Tensor.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
            no_grad_set (set[Tensor]|set[str]|None, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
                to be updated. The default value is None.
            callbacks (list|None, optional): list of callable objects to run when appending backward
                operator for one parameter. The default value is None.

        Return:
            list[tuple[Tensor, Tensor]], list of (param, grad) tensor pairs, param is ``Parameter``,
                grad is the gradient value corresponding to the parameter.

        Examples:
            .. code-block:: python

                >>> import paddle
                >>> x = paddle.arange(26, dtype="float32").reshape([2, 13])

                >>> linear = paddle.nn.Linear(13, 5)
                >>> # This can be any optimizer supported by dygraph.
                >>> adam = paddle.optimizer.Adam(learning_rate = 0.01,
                ...                             parameters = linear.parameters())
                >>> out = linear(x)
                >>> out.backward()
                >>> adam.step()
                >>> adam.clear_grad()
        Nr   zJThe number of elements of loss should be 1, but the current loss.shape is zh, whose number of elements is not 1. Maybe that you should call paddle.mean to process the current loss.c                $    g | ]}|j         d u |S )Fr  r   r^   s     r5   r   z&Optimizer.backward.<locals>.<listcomp>2  s0     * * * %$2e;; ";;;r4   )no_grad_varsr   )prim_enabled)"r   r   _get_no_grad_setr   r   r|   r
   r  get_all_gradsrK   rF   rw   r   r   error_clip_callbackrv   r{   r>   rR   npprodr   r   r   r   r?   r@   autogradir_backwardrd  paddle.incubate.autograd.utilsr  r_   r   )r   r  r  rg   rN   rO   act_no_grad_setrM   params_gradsgradsrD   rd  rR   program_all_paramsr  s                  r5   backwardzOptimizer.backward  s   X $&& 	G"33D+FFO ;*DK$&& 2	+5OZZ4;ON LJ,,^<<E(// G Gt# '')>(EFFFG  #Y^?@		!)T22222j(G74:&&!+++V]a]g V V V ,++ ,6OZZ4;ON,,WoFF  == %- $0022AACC +* *);* * *
 $&L"O7<<n? =  E (1'7'7 O Ot+(//1F0MNNNO LKKKKK#|~~ ':!FNOY( ( (7 ./9( (7              < s   CH00H47H4r  list[Operator]c                t   t          | d          st          |d           }| j        |                     |          }n$t          j        j                            |          }|                     || j                  }t                      r| 
                    |          }n|                     |          }|S )a{  
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.

        Args:
            params_grads (list[tuple[Tensor, Tensor]]): list of (param, grad) pair to do optimization.

        Returns:
            list: A list of operators appended to the current program.

        Examples:
            .. code-block:: python

                >>> import paddle

                >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
                >>> linear = paddle.nn.Linear(10, 10)
                >>> out = linear(inp)
                >>> loss = paddle.mean(out)
                >>> optimizer = paddle.optimizer.Adam(learning_rate=0.1,
                ...         parameters=linear.parameters())
                >>> params_grads = optimizer.backward(loss)
                >>> optimizer.apply_gradients(params_grads)

        _sortedc                    | d         j         S Nr   rl   )xs    r5   ru   z+Optimizer.apply_gradients.<locals>.<lambda>j  s    adi r4   )r   )r   rH   r   rw   r   r   append_gradient_clip_opsappend_regularization_opsrc   r   r  r  )r   r  optimize_opss      r5   apply_gradientszOptimizer.apply_gradientsK  s    < tY'' 	I!,4G4GHHHL ?&??<88LL!9>BB<PPL 55$-
 
 == 	H==lKKLL99,GGLr4   c                |   t          j                    r	t          rdS t                      r/t          j                            t          j                                        t          j                                                  5  t          j	        j
        j                                        }ddlm}  |            }||                                 g }|j        j        D ]>}	|	j        j        j        s+|                    |	j        j        |	j        j        f           ?|}| j        "d| j        _        |j        j        | j        _        n+|r)t          j	        j
        j                            |           t9          |t:                    r8| j        |                     |          }|                     || j                  }nB|d         }
|
 |
|d                   |d<   |                     |d         | j                  |d<   tA                      r| !                    ||          }n| "                    ||          }ddd           n# 1 swxY w Y   na|dk    sJ |j#        j$        }t          j                            ||          5  | %                    |          }ddd           n# 1 swxY w Y   |S )a  
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.
        Args:
            loss (Tensor): loss tensor to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameters`.
            params_grads (list): list of (param, grad) pair to do optimization.
        Returns:
            list: A list of operators appended to the current program.
        Nr   )get_fsdp_contextTrj   r,   r  )&r   r    g_shard_bypass_dygraph_optimizerr   rw   r   r   r   r   r  r  auto_dp_utilsin_auto_dp_mode3paddle.distributed.auto_parallel.fully_shard_fusionr  comm_sync_and_reset_statusbuffer_managerbuffer_groupsparams_bufferdata_bufferr   rF   grads_bufferr   should_comm_on_shard_dim_fsdp_group
fsdp_group'_convert_fake_replicate_grad_to_partialrv   r{   r  rc   r   r  r  r>   rR   r  )r   r  r  r  r  auto_dpr  fsdp_contextnew_params_gradsgrouprj   r  rR   s                r5   _apply_optimizezOptimizer._apply_optimize}  s    $&& 	+K 	F!## =	B,,22445577  7 7 !,:HXXZZ       0/11+ ;;===')$!-!<!J  $2>L ,33$)$7$C$)$6$B!"   $4L2CG@(7C 2  &4Bjj$   lD11 2'+|'D'D#'#A#A$d&9$ $LL !-[ 9I ,1:(22 2X. .2-K-K$X.0C. .L* == #'#E#E$o $F $ $LL $(#A#A$o $B $ $Lk7 7 7 7 7 7 7 7 7 7 7 7 7 7 7r #a''''j(G,,WoFF B B#33LAAB B B B B B B B B B B B B B Bs%   F?IIIJ11J58J5c                    |)t          |d          rt          |d          r|j        ||S d} fd} |||          }t          |d          r$|j        |                    |||j                  }n| ||||j                  }|J t                      rt	          j        ||g          S |}|j        t          j        j	        j
        k    r[|j                            |j        t          j                    z   |j        |j        |j        t          j        j	        j                  }d||gi}d|gi}|j                            d||           |S )	zpCreate and add backward regularization Operators

        Function helper of append_regularization_ops.
        Nrq   c                    | }| j         |j         k    rhj        o                    | j                   }|r+t          j                  dk    rj        | j                 }n|                     |j                   }|S r  )r   r~  ra  rJ   r   rl   astype)r^   rd  r  r  r   s       r5   get_target_paramzBOptimizer._create_regularization_of_grad.<locals>.get_target_param  s     L{dj(() A33EK@@   <3t';#<#<#A#A#'#7
#CLL#(<<
#;#;Lr4   )rl   r   r   r_  ry   rE  r"  sum)ry   rH  r#  )r   rq   r>   r   r   add_nry   r
   r  r  SELECTED_ROWSrc  rl   kNewGradSuffixr   r   r_  rt  r'  )	r   r^   rd  rc   regularization_termr  new_gradrH  r#  s	   `        r5   _create_regularization_of_gradz(Optimizer._create_regularization_of_grad  s    <E=11  E=11  7<6G6O&K"	  	  	  	  	  ! --5-(( 	JU->-J"'"3"3E4"L"L'"0.dj"I"I"...!## 	<': ;<<<HyDL0>>>
  :00T%8%:%::++#o-: 1   D"567Fxj)GJ  eFG LLLOr4   rj  c                @   g }t          j                    st                      r7|D ]3\  }}|                     |||          }|                    ||f           4nd}t          j        d          5  |D ]\  }}|s#|j        |d}t          j        d| d           |j	        j
                            ||g          5  |                     |||          }|                    ||f           ddd           n# 1 swxY w Y   	 ddd           n# 1 swxY w Y   |S )a  Create and add backward regularization Operators

        Creates and adds backward regularization operators in the BlockDesc.
        This will add gradients of the regularizer function to the gradients
        of the parameters and return these modified gradients. This is the
        same as implementing weight decay in optimizers for regularization.

        Args:
            parameters_and_grads (list[tuple[Tensor,Tensor]]): A list of (parameters, gradients) pairs
                that need to be regularized.
            regularization (WeightDecayRegularizer|None, optional): A global regularizer. If the parameter is not
                set. It will be applied with regularizer.

        Returns:
            list[tuple[Tensor,Tensor]]: list of (parameters, gradients) \
                pair with the regularized gradient

        Raises:
            Exception: Unknown regularization type
        Frc   NTzyIf regularizer of a Parameter has been set by 'base.ParamAttr' or 'base.WeightNormParamAttr' already. The Regularization[rr   )r   r   r   r	  rF   r   rq   r   r   r>   rR   r  )r   rj  rc   r\   r^   rd  r  repeat_regularizers           r5   r  z#Optimizer.append_regularization_ops  s&   2 $&& 	C+-- 	C3 ; ;t>>4  !''(9::::	; "'%&677 C C#7 C CKE4.	!-9*6-1*X2@X X X   ,==udmLL C C#'#F#F!4$ $ )//0ABBB	C C C C C C C C C C C C C C CCC C C C C C C C C C C C C C C"  s7   1AD?/C:.D:C>>DC>DDDc                   t                      rbt          |          }|j        j                                                                        }d |D             }|                    |           |S t          |          }|j        j                                                                        }d |D             }|                    |           |S )Nc                $    g | ]}|j         d u |S Tr  r  s     r5   r   z.Optimizer._get_no_grad_set.<locals>.<listcomp>G  s,     " " "1D1L1L1L1L1Lr4   c                .    h | ]}|j         d u |j        S r  )r   rl   r  s     r5   	<setcomp>z-Optimizer._get_no_grad_set.<locals>.<setcomp>P  s2     " " "&$.. 
...r4   )r   r   r>   rR   r?   r@   r  r   )r   r  rN   rg   param_no_trainables        r5   r  zOptimizer._get_no_grad_setC  s    == 	0==K+88::IIKKJ" "#-" " " 1222/<<K+88::IIKKJ" "'" " " 1222r4   Tset_to_zeroboolc                <   g }| j          t          | j         d         t                    s'| j         D ]}|j        s|                    |           n1| j        D ])}|d         D ]}|j        s|                    |           *|D ]}|                    |           dS )a  
        Clear the gradients of all optimized parameters for model.

        If not, new gradient will accumulat on previous gradient.

        There are two method to clear grad: set_to_zero or delete grad.

        Args:
            set_to_zero (bool, optional): If set grads to zero or not, default is True.

        Returns:
            None

        Examples:
            .. code-block:: python

                >>> import paddle

                >>> a = paddle.arange(26, dtype="float32").reshape([2, 13])
                >>> linear = paddle.nn.Linear(13, 5)
                >>> # This can be any optimizer supported by dygraph.
                >>> adam = paddle.optimizer.Adam(learning_rate = 0.01,
                ...                             parameters = linear.parameters())
                >>> out = linear(a)
                >>> out.backward()
                >>> adam.step()
                >>> adam.clear_grad()

        Nr   r,   )r|   rv   rz   r   rF   r   clear_gradient)r   r  
param_listr  r   s        r5   r   zOptimizer.clear_gradY  s    > 
'z #T0
 0
' ) ) ) )%%a((()  $1 - -$X. - -A? -"))!,,,-  	* 	*A[))))	* 	*r4   set_to_nonec                4    |                      |            d S )N)r  )r   )r   r  s     r5   	zero_gradzOptimizer.zero_grad  s    O44444r4   2tuple[list[Operator], list[tuple[Tensor, Tensor]]]c                    t          |t          t          j        j        f          s
J d            |r|n| j        }|                     ||||          }|                     |||          }||fS )aa  
        Add operations to minimize ``loss`` by updating ``parameters``.

        Args:
            loss (Tensor): A ``Tensor`` containing the value to minimize.
            startup_program (Program|None, optional): :ref:`api_paddle_static_Program` for
                initializing parameters in ``parameters``. The default value
                is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
            parameters (list[Tensor]|list[str]|None, optional): List of ``Tensor`` or ``Tensor.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
            no_grad_set (set[Tensor]|set[str]|None, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
                to be updated. The default value is None.

        Returns:
            tuple[list[Operator],list[tuple[Tensor, Tensor]]], A list of operators appended
                by minimize and a list of (param, grad) tensor pairs, param is
                ``Parameter``, grad is the gradient value corresponding to the parameter.
                In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
                indicate program pruning. If so, the program will be pruned by ``feed`` and
                ``fetch_list`` before run, see details in ``Executor``.

        Examples:
            .. code-block:: python

                >>> import paddle
                >>> linear = paddle.nn.Linear(10, 10)
                >>> input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
                >>> out = linear(input)
                >>> loss = paddle.mean(out)

                >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
                >>> beta2 = paddle.to_tensor([0.99], dtype="float32")

                >>> adam = paddle.optimizer.Adam(learning_rate=0.1,
                ...         parameters=linear.parameters(),
                ...         weight_decay=0.01)
                >>> loss.backward()
                >>> adam.minimize(loss)
                >>> adam.clear_grad()

        zThe loss should be an Tensor.)r  rg   rN   )r  r  )rv   r   rw   r   r  r|   r  r  )r   r  r  rg   rN   rM   r  r  s           r5   minimizezOptimizer.minimize  s    d $6:+; <== 	
 	
+	
 	
= (2Kt7K}}+%#	 % 
 
 ++/ , 
 
 \))r4   c                   t           j                                                                                                        }t          | j        d         t                    r
J d            d | j        D             d |D             }t          t          fd|                    }d |D             }| 
                    |          }dS )zW
        In declarative mode, we forward `call step` to `call apply_gradients`
        r   zQOnly list of parameters is supported while using optimizer in @paddle.jit.static.c                    h | ]	}|j         
S r3   r  r  s     r5   r  z.Optimizer._declarative_step.<locals>.<setcomp>  s    HHH%5:HHHr4   c                     g | ]}|j         	|S r3   )	trainabler  s     r5   r   z/Optimizer._declarative_step.<locals>.<listcomp>  s    CCC5?CeCCCr4   c                6    | j         v ot          | d          S )Nrd  )rl   r   )r  selected_paramss    r5   ru   z-Optimizer._declarative_step.<locals>.<lambda>  s    !&O3J68J8J r4   c                     g | ]}||j         fS r3   )rd  r  s     r5   r   z/Optimizer._declarative_step.<locals>.<listcomp>  s    DDD
+DDDr4   N)rw   r   r   r?   r@   rv   r|   rz   r{   filterr  )r   r,   rg   r  r  r"  s        @r5   _declarative_stepzOptimizer._declarative_step  s    
 M..00==??NNPP 	 d215t<< 	
 	
_	
 	
< IH43GHHHCCCCC
JJJJ 
 

 EDDDD++L99r4   c                (   t           j        j        j                                        r|                                  dS t          | j        d         t                    sg }| j        D ]}|j        r
t          | dd          r4t          |d          r#|j        |                    ||j        f           Ot          |d          r$|j        |                    ||j        f           |                                +|                                }|                    ||f           |                     dd|d           dS t          | j                  D ]\  }}t!          d           }|d         D ]O}|j        r
|                                1|                                }|d                             ||f           P|                    d	 |                                D                        |                     dd||           dS )
a  
        Execute the optimizer and update parameters once.

        Returns:
            None

        Examples:
            .. code-block:: python

                >>> import paddle

                >>> a = paddle.arange(26, dtype="float32").reshape([2, 13])
                >>> linear = paddle.nn.Linear(13, 5)
                >>> # This can be any optimizer supported by dygraph.
                >>> adam = paddle.optimizer.Adam(learning_rate = 0.01,
                ...                         parameters = linear.parameters())
                >>> out = linear(a)
                >>> out.backward()
                >>> adam.step()
                >>> adam.clear_grad()
        Nr   enable_tensor_fusionF	main_grad)r  r  r  r  c                     g S rt   r3   r3   r4   r5   ru   z Optimizer.step.<locals>.<lambda>   s    2 r4   r,   c                &    i | ]\  }}|d k    ||S r  r3   r  s      r5   r  z"Optimizer.step.<locals>.<dictcomp>(  s#    KKKdaQ(]]Q]]]r4   )rw   r	  dygraphin_to_static_moder%  rv   r   rz   r   getattrr   r(  rF   
_grad_ivarr  rK   r   r  r   )r   r  r^   grad_varr  r   s         r5   stepzOptimizer.step  so   0 ;#5577 	""$$$F$,Q/66 -	L+ ? ?& 4!7?? ?{33F!O7$++UEO,DEEEE;//?49O4O ''(@AAAA''))5#(#3#3#5#5$++UH,=>>>   $) !	 !      %.d.@$A$A   [*::66(2 I IE* ! ''))5#(#3#3#5#5$X.55uh6GHHH##KKk&7&7&9&9KKK   $$$(!-$'	 %     r4   c                   |d         }t          |t                    r|g|d<   n6t          |t                    rt          d          t	          |          |d<   | j                                        D ]\  }}|                    ||           t                      }| j        D ]*}|	                    t          |d                              +|
                    t          |d                             st          d          |d         D ]V}|d         }t          |t                    rt          |          }	n|}	|	|_        |                    dd          |j        d<   W| j                            |           dS )z
        Add a param group to parameter_list.

        Args:
            param_group (dict): The group of Tensors to be optimized with
            different optimization options.
        r,   z`optimizer parameters should be in ordered collections,but received set, please use list instead.z7some parameters appear in more than one parameter groupr-   r.   r7  N)rv   r   r   rx   r{   r   r   
setdefaultr   r  
isdisjoint
ValueErrorr   r   rq   r   r6  rF   )
r   r   r,   r   r   	param_setr  r^   r-   rc   s
             r5   r   zOptimizer._add_param_group1  s    X&fi(( 	1%+HK!!$$ 	1=  
 %)LLK! &,,.. 	) 	)DAq""1a((((EE	' 	3 	3ESx112222##CH(=$>$>?? 	I   !* 		 		E&~6L,.. .!(!6!6!- .E3>??4 4E00 	!!+.....r4   c                    dS )z
        Update the param group with new entry
        Args:
            parameters (dict): The extra group of Tensors to be optimized with
            different optimization options. Only used in child class.
        Nr3   )r   rg   s     r5   r  zOptimizer._update_param_group^  rh  r4   c                    dS )a  
        All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32).
        This function will be overridden in the corresponding optimizer file.

        Args:
            target_block: the block in which the loss tensor is present
            parameters: list of parameter tensors for the optimizer
        Nr3   )r   r  rg   r  s       r5   r  zOptimizer._multi_tensor_initg  s	     	r4   c                    dS )zM
        For Multi Tensor, append optimize merged_operator to block.
        Nr3   )r   r  rj  r  s       r5   r  z*Optimizer._append_optimize_multi_tensor_ops  rh  r4   c                f   t          |t          j        j        t          j        f          s
J d            t          |t          j        j                  r4|t          j        j        j        k    p|t          j        j        j        k    S |t          j        j        k    p|t          j        j        k    S )z
        check the dtype is fp16 or the dtype is bf16
        :param dtype: instance of core.VarDesc.VarType
        :return: True if dtype is one of fp16 or bf16, False otherwise
        zIThe dtype should be an instance of core.VarDesc.VarType or core.DataType.)	rv   r
   r  r  r
  FP16BF16FLOAT16BFLOAT16)r   r   s     r5   ra  z Optimizer._is_dtype_fp16_or_bf16|  s     %$,"6!FGG 	
 	
W	
 	
G eT\122 		-22 6DL055 .. 3DM22r4   c                <    d| _         |                                  d S )NT)r   r   r   s    r5   use_fusion_storagezOptimizer.use_fusion_storage  s"    #' r4   c                    | j         | _        d S rt   )r   r   r   s    r5   r   zOptimizer.need_refuse  s     4r4   c                    d| _         d S r   )r   r   s    r5   r   zOptimizer.reset_need_refuse  s    !r4   c                    | j         S rt   )r   r   s    r5   fused_buffer_versionzOptimizer.fused_buffer_version  s    ((r4   c                ,    | j         d S | j         j        S rt   )r   bufferr   s    r5   r   zOptimizer.fused_states_buffer  s    &4"))r4   c                ,    | j         d S | j         j        S rt   )r   buffer_ipc_metar   s    r5   fused_states_buffer_ipc_metaz&Optimizer.fused_states_buffer_ipc_meta  s    &4"22r4   c                ,    | j         d S | j         j        S rt   )r   accumulators_metar   s    r5   fused_states_accumulators_metaz(Optimizer.fused_states_accumulators_meta  s    &4"44r4   c                ,    | j         d S | j         j        S rt   )r   master_weights_metar   s    r5    fused_states_master_weights_metaz*Optimizer.fused_states_master_weights_meta  s    &4"66r4   )NNNN)r.   rf   rg   rh   r-   ri   rj   rk   rl   rm   rn   ro   )rn   r   )r   r   rn   ro   )rn   r   )r   r   rn   ro   )r*  r!   rn   ro   )rn   r   rt   )Nrl  NNNr  )r  r&   r  r  rg   r  rN   r  rO   r  rn   r  )r  r  rn   r  )rj  r  rc   rb   rn   r  r  )r  r  rn   ro   )r  r  rn   ro   )NNN)
r  r&   r  r  rg   r  rN   r  rn   r  )rn   ro   )Ar/   r0   r1   __doc__r2   imperative_baseno_gradr   r   r   r   r   r   r   r   dygraph_onlyr   r   load_state_dictr   r  r)  r-  r/  r   r4  r<  rZ  rI  re  rg  rk  ry  r|  r  r  rs  r  r  r  r  r  r	  r  r  non_static_onlyr   r  r  r%  r0  r   r  r  r  ra  r?  r   r   propertyrC  r   rH  rK  rN  r3   r4   r5   ra   ra      s        I IV 2111""""++++_ LP>B-1v( v( v( v( v(p" " "( ( (
 
 
3 3 3   _$
 $
 $
L + + + +Z D8 D8 D8 D8L %O$ $ $ $A A AF A A A AF *( *( *( *(XY) Y) Y) Y)v
: 
: 
: 
:
 
 
0 0 02C C CJ. . .  6    " s s s sj4 4 4*5 5 54      56^3 ^3 ^3 ^3B 56H. H. H. H.Z +/6:596:i i i i iV0 0 0 0f DEP P P Pd= = = =D 9=3  3  3  3  3 j   , ,* ,* ,* ,* ,*\ 5 5 5 5 5 _ +/6:59B* B* B* B* B*H: : :* _G G G  GR+/ +/ +/Z   	 	 	     (  5 5 5" " " ) ) X) * * X*
 3 3 X3
 5 5 X5
 7 7 X7 7 7r4   ra   )NNNNN)J
__future__r   r   r   collectionsr   typingr   numpyr  rw   paddle.autogradr  rP  r   paddle._pir_opsr   r   paddle.autograd.backward_utilsr	   paddle.baser
   paddle.base.frameworkr   r   r   r   r   r   r   r   paddle.regularizerr   r   r	  r   r   base.backwardr   r   r   base.frameworkr   base.layer_helperr   r   base.log_helperr   r  r    r!   collections.abcr"   r#   typing_extensionsr$   r%   r&   paddle.nn.clipr'   r(   r)   r+   r/   INFOr   __all__r   environr   r  static_onlyr_   ra   r3   r4   r5   <module>rk     sJ  & # " " " " "  				 # # # # # #                  ) ) ) ) ) )       4 4 4 4 4 4 4 4 4 4 4 4 4 4      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? > > > > > > > ) ) ) ) ) ) ) )         
 ' & & & & & < < < < < < < < ( ( ( ( ( ( ( ( ( ( ( ( ( ( H2222222288888888//////22222222H H H H H9 H H H zgl H  
 #&3JNN91==$ $  
  , , , ,^o 7 o 7 o 7 o 7 o 7 o 7 o 7 o 7 o 7 o 7r4   