from __future__ import annotations

import os
import warnings
from collections import defaultdict
from functools import reduce
from typing import TYPE_CHECKING, NoReturn, TypedDict

from typing_extensions import NotRequired

import paddle

from ..base import framework
from .optimizer import Optimizer

if TYPE_CHECKING:
    from collections.abc import Sequence

    from paddle import Tensor
    from paddle.nn.clip import GradientClipBase
    from paddle.regularizer import WeightDecayRegularizer

    from .optimizer import _ParameterConfig

__all__ = []


class _LbfgsState(TypedDict):
    func_evals: int
    n_iter: int
    d: Tensor
    alpha: Tensor
    old_yk: list[Tensor]
    old_sk: list[Tensor]
    ro: list[Tensor]
    H_diag: Tensor
    prev_flat_grad: Tensor
    prev_loss: float
    al: NotRequired[list[Tensor]]


class _LbfgsStateDict(TypedDict):
    state: _LbfgsState


def check_tf32_override() -> None:
    """Check and warn about TF32 acceleration status"""
    if (
        paddle.device.is_compiled_with_cuda()
        and os.getenv("NVIDIA_TF32_OVERRIDE") != "0"
    ):
        warnings.warn(
            "Warning! TF32 Tensor Cores are enabled by default on some NVIDIA GPUs "
            "for faster computation, but may compromise numerical precision in "
            "specific cases, particularly with the L-BFGS optimizer. "
            "To disable it, set: NVIDIA_TF32_OVERRIDE=0"
        )


def dot(x, y):
    r"""
    NOTE: This is a temporary workaround for unstable result computed by `paddle.dot`,
    which will be reverted when the problem is fixed.
    """
    return (x * y).sum(axis=-1)


def _cubic_interpolate(x1, f1, g1, x2, f2, g2, bounds=None):
    r"""Cubic interpolation between (x1, f1, g1) and (x2, f2, g2).
        Use two points and their gradient to determine a cubic function and get the minimum point
        between them in the cubic curve.
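
        For example, take point1 = (x1, f1, g1) = (0, 1, -2) and point2 = (x2, f2, g2) = (1, 0.5, 1).
        Then d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2) = 0.5 and d2 = sqrt(d1 ** 2 - g1 * g2) = 1.5,
        so the interpolated minimum lies at
        min_pos = x2 - (x2 - x1) * (g2 + d2 - d1) / (g2 - g1 + 2 * d2) = 2 / 3.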

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
        pp59: formula 3.59

    Args:
        x1, f1, g1: point1's position, value and gradient.
        x2, f2, g2: point2's position, value and gradient.
        bounds: bounds of interpolation area

    Returns:
        min_pos: the minimum point between the specified points in the cubic curve.
    """
    if bounds is not None:
        xmin_bound, xmax_bound = bounds
    else:
        xmin_bound, xmax_bound = (x1, x2) if x1 <= x2 else (x2, x1)

    # solve for the minimizer of the cubic matching the two values and two
    # gradients (Nocedal & Wright, formula 3.59)
    d1 = g1 + g2 - 3 * (f1 - f2) / (x1 - x2)
    d2_square = d1**2 - g1 * g2
    if d2_square >= 0:
        d2 = d2_square.sqrt()
        if x1 <= x2:
            min_pos = x2 - (x2 - x1) * ((g2 + d2 - d1) / (g2 - g1 + 2 * d2))
        else:
            min_pos = x1 - (x1 - x2) * ((g1 + d2 - d1) / (g1 - g2 + 2 * d2))
        return min(max(min_pos, xmin_bound), xmax_bound)
    else:
        # the cubic has no interior minimum; fall back to the midpoint
        return (xmin_bound + xmax_bound) / 2.0


def _strong_wolfe(
    obj_func,
    xk,
    alpha,
    d,
    loss,
    grad,
    gtd,
    c1=1e-4,
    c2=0.9,
    tolerance_change=1e-9,
    max_ls=25,
):
    r"""Implements of line search algorithm that satisfies the strong Wolfe conditions using double zoom.

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
        pp60: Algorithm 3.5 (Line Search Algorithm).

    Args:
        obj_func: the objective function to minimize. It accepts a multivariate input and returns a scalar.
        xk (Tensor): the starting point of the iterates.
        alpha (Scalar): the initial step size.
        d (Tensor): search direction.
        loss (scalar): the initial loss
        grad (Tensor): the initial grad
        c1 (Scalar): parameter for sufficient decrease condition.
        c2 (Scalar): parameter for curvature condition.
        tolerance_change (Scalar): terminates if the change of function value/position/parameter between
            two iterations is smaller than this value.
        max_ls(int): max iteration of line search.
        alpha_max (float): max step length.

    Returns:
        loss_new (Scalar): loss of obj_func at final alpha.
        grad_new (Tensor): derivative of obj_func at final alpha.
        alpha (Tensor): optimal step length, or 0. if the line search algorithm did not converge.
        ls_func_evals (Scalar): number of objective function evaluations made during the line search.

    Following summarizes the essentials of the strong Wolfe line search algorithm.
    Some notations used in the description:

        - `func` denotes the objective function.
        - `obi_func` is a function of step size alpha, restricting `obj_func` on a line.

            obi_func = func(xk + alpha * d),
            where xk is the position of k'th iterate, d is the line search direction(decent direction),
            and a is the step size.
        - a : substitute of alpha
        - a1 is alpha of last iteration, which is alpha_(i-1).
        - a2 is alpha of current iteration, which is alpha_i.
        - a_lo is alpha in left position when calls zoom, which is alpha_low.
        - a_hi is alpha in right position when calls zoom, which is alpha_high.

    Line Search Algorithm:
        repeat
            Compute obi_func(a2) and derphi(a2).
            1. If obi_func(a2) > obi_func(0) + c_1 * a2 * obi_func'(0) or [obi_func(a2) >= obi_func(a1) and i > 1],
                alpha= zoom(a1, a2) and stop;

            2. If |obi_func'(a2)| <= -c_2 * obi_func'(0),
                alpha= a2 and stop;

            3. If obi_func'(a2) >= 0,
                alpha= zoom(a2, a1) and stop;

            a1 = a2
            a2 = min(2 * a2, alpha_max)
            i = i + 1
        end(repeat)

    zoom(a_lo, a_hi) Algorithm:
        repeat
            aj = cubic_interpolation(a_lo, a_hi)
            Compute obi_func(aj) and derphi(aj).
            1. If obi_func(aj) > obi_func(0) + c_1 * aj * obi_func'(0) or obi_func(aj) >= obi_func(a_lo),
                then a_hi <- aj;
            2.
                2.1. If |obi_func'(aj)| <= -c_2 * obi_func'(0), then alpha= aj and stop;

                2.2. If obi_func'(aj) * (a_hi - a_lo) >= 0, then a_hi = a_lo

                a_lo = aj;
        end(repeat)
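
    Worked check of the two conditions with c1 = 1e-4 and c2 = 0.9 on a one-dimensional example:
    for obj_func(a) = (a - 2)**2 we have obj_func(0) = 4 and obj_func'(0) = -4, so the trial step
    a2 = 2 satisfies both obj_func(2) = 0 <= 4 + 1e-4 * 2 * (-4) (sufficient decrease) and
    |obj_func'(2)| = 0 <= -0.9 * (-4) (curvature), and the search stops with alpha = 2.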

    reference: https://github.com/pytorch/pytorch
    """
    d_norm = d.abs().max()
    grad = grad.clone()
    # evaluate objective and gradient using initial step
    loss_new, grad_new = obj_func(xk, alpha, d)
    ls_func_evals = 1
    gtd_new = dot(grad_new, d)

    # bracket an interval containing a point satisfying the Wolfe criteria
    t_prev, f_prev, g_prev, gtd_prev = 0, loss, grad, gtd
    done = False
    ls_iter = 0
    while ls_iter < max_ls:
        # check conditions
        if loss_new > (loss + c1 * alpha * gtd) or (
            ls_iter > 1 and loss_new >= f_prev
        ):
            bracket = [t_prev, alpha]
            bracket_f = [f_prev, loss_new]
            bracket_g = [g_prev, grad_new.clone()]
            bracket_gtd = [gtd_prev, gtd_new]
            break
        if abs(gtd_new) <= -c2 * gtd:
            bracket = [alpha]
            bracket_f = [loss_new]
            bracket_g = [grad_new]
            done = True
            break
        if gtd_new >= 0:
            bracket = [t_prev, alpha]
            bracket_f = [f_prev, loss_new]
            bracket_g = [g_prev, grad_new.clone()]
            bracket_gtd = [gtd_prev, gtd_new]
            break

        # interpolate to find a new trial step
        min_step = alpha + 0.01 * (alpha - t_prev)
        max_step = alpha * 10
        tmp = alpha
        alpha = _cubic_interpolate(
            t_prev,
            f_prev,
            gtd_prev,
            alpha,
            loss_new,
            gtd_new,
            bounds=(min_step, max_step),
        )

        # next step
        t_prev = tmp
        f_prev = loss_new
        g_prev = grad_new.clone()
        gtd_prev = gtd_new
        loss_new, grad_new = obj_func(xk, alpha, d)
        ls_func_evals += 1
        gtd_new = dot(grad_new, d)
        ls_iter += 1

    # reached max number of iterations?
    if ls_iter == max_ls:
        bracket = [0, alpha]
        bracket_f = [loss, loss_new]
        bracket_g = [grad, grad_new]

    # zoom phase: we now either have a point satisfying the criteria, or a
    # bracket around it; refine the bracket until the exact point is found
    insuf_progress = False
    # find high and low points in bracket
    low_pos, high_pos = (0, 1) if bracket_f[0] <= bracket_f[-1] else (1, 0)
    while not done and ls_iter < max_ls:
        # line-search bracket is so small
        if abs(bracket[1] - bracket[0]) * d_norm < tolerance_change:
            break

        # compute new trial value
        alpha = _cubic_interpolate(
            bracket[0],
            bracket_f[0],
            bracket_gtd[0],
            bracket[1],
            bracket_f[1],
            bracket_gtd[1],
        )

        # test that we are making sufficient progress:
        # in case `alpha` is so close to a boundary, we mark that we are making
        # insufficient progress, and if
        #   + we have made insufficient progress in the last step, or
        #   + `alpha` is at one of the boundaries,
        # we will move `alpha` to a position which is `0.1 * len(bracket)`
        # away from the nearest boundary point.
        eps = 0.1 * (max(bracket) - min(bracket))
        if min(max(bracket) - alpha, alpha - min(bracket)) < eps:
            # interpolation close to boundary
            if insuf_progress or alpha >= max(bracket) or alpha <= min(bracket):
                # evaluate at 0.1 away from boundary
                if abs(alpha - max(bracket)) < abs(alpha - min(bracket)):
                    alpha = max(bracket) - eps
                else:
                    alpha = min(bracket) + eps
                insuf_progress = False
            else:
                insuf_progress = True
        else:
            insuf_progress = False

        # evaluate new point
        loss_new, grad_new = obj_func(xk, alpha, d)
        ls_func_evals += 1
        gtd_new = dot(grad_new, d)
        ls_iter += 1

        if loss_new > (loss + c1 * alpha * gtd) or loss_new >= bracket_f[low_pos]:
            # Armijo condition not satisfied or not lower than lowest point
            bracket[high_pos] = alpha
            bracket_f[high_pos] = loss_new
            bracket_g[high_pos] = grad_new.clone()
            bracket_gtd[high_pos] = gtd_new
            low_pos, high_pos = (
                (0, 1) if bracket_f[0] <= bracket_f[1] else (1, 0)
            )
        else:
            if abs(gtd_new) <= -c2 * gtd:
                # Wolfe conditions satisfied
                done = True
            elif gtd_new * (bracket[high_pos] - bracket[low_pos]) >= 0:
                # old low becomes new high
                bracket[high_pos] = bracket[low_pos]
                bracket_f[high_pos] = bracket_f[low_pos]
                bracket_g[high_pos] = bracket_g[low_pos]
                bracket_gtd[high_pos] = bracket_gtd[low_pos]

            # new point becomes new low
            bracket[low_pos] = alpha
            bracket_f[low_pos] = loss_new
            bracket_g[low_pos] = grad_new.clone()
            bracket_gtd[low_pos] = gtd_new

    # return new loss, new grad, step length and number of evaluations
    alpha = bracket[low_pos]
    loss_new = bracket_f[low_pos]
    grad_new = bracket_g[low_pos]
    return loss_new, grad_new, alpha, ls_func_evals


class LBFGS(Optimizer):
    r"""
    The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
    Closely related is the Newton method for minimization. Consider the iterate update formula:

    .. math::
        x_{k+1} = x_{k} - H_k \nabla{f_k}

    If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
    If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then
    it's a quasi-Newton. In practice, the approximated Hessians are obtained
    by only using the gradients, over either whole or part of the search
    history, the former is BFGS, the latter is L-BFGS.
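
    The quasi-Newton approximation is built from the curvature pairs
    :math:`s_k = x_{k+1} - x_{k}` and :math:`y_k = \nabla{f_{k+1}} - \nabla{f_k}` through the BFGS
    update of the inverse Hessian

    .. math::
        H_{k+1} = (I - \rho_k s_k y_k^T) H_k (I - \rho_k y_k s_k^T) + \rho_k s_k s_k^T,
        \quad \rho_k = \frac{1}{y_k^T s_k}

    L-BFGS never forms :math:`H_k` explicitly: it keeps only the most recent ``history_size`` pairs
    :math:`(s_i, y_i)` and reconstructs the product :math:`H_k \nabla{f_k}` with a two-loop recursion.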

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).

    Args:
        learning_rate (float, optional): learning rate. The default value is 1.
        max_iter (int, optional): maximal number of iterations per optimization step.
            The default value is 20.
        max_eval (int|None, optional): maximal number of function evaluations per optimization
            step. The default value is max_iter * 1.25.
        tolerance_grad (float, optional): termination tolerance on first order optimality.
            The default value is 1e-07.
        tolerance_change (float, optional): termination tolerance on function
            value/parameter changes. The default value is 1e-9.
        history_size (int, optional): update history size. The default value is 100.
        line_search_fn (string|None, optional): either 'strong_wolfe' or None. The default value is None.
        parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. The default value is None.
        weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
            It can be an int or float value as coeff of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of \
            some derived class of ``GradientClipBase`` . There are three clipping strategies \
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str|None, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    Return:
        loss (Tensor): the final loss of closure.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import numpy as np

            >>> paddle.disable_static()
            >>> np.random.seed(0)
            >>> np_w = np.random.rand(1).astype(np.float32)
            >>> np_x = np.random.rand(1).astype(np.float32)

            >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)]
            >>> # y = 2x
            >>> targets = [2 * x for x in inputs]

            >>> class Net(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...         w = paddle.to_tensor(np_w)
            ...         self.w = paddle.create_parameter(shape=w.shape, dtype=w.dtype, default_initializer=paddle.nn.initializer.Assign(w))
            ...
            ...     def forward(self, x):
            ...         return self.w * x
            ...
            >>> net = Net()
            >>> opt = paddle.optimizer.LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
            >>> def train_step(inputs, targets):
            ...     def closure():
            ...         outputs = net(inputs)
            ...         loss = paddle.nn.functional.mse_loss(outputs, targets)
            ...         print('loss: ', loss.item())
            ...         opt.clear_grad()
            ...         loss.backward()
            ...         return loss
            ...     opt.step(closure)
            ...
            >>> for input_np, target_np in zip(inputs, targets):
            ...     input = paddle.to_tensor(input_np)
            ...     target = paddle.to_tensor(target_np)
            ...     train_step(input, target)
    """

    def __init__(
        self,
        learning_rate: float = 1.0,
        max_iter: int = 20,
        max_eval: int | None = None,
        tolerance_grad: float = 1e-07,
        tolerance_change: float = 1e-09,
        history_size: int = 100,
        line_search_fn: str | None = None,
        parameters: Sequence[Tensor] | Sequence[_ParameterConfig] | None = None,
        weight_decay: float | WeightDecayRegularizer | None = None,
        grad_clip: GradientClipBase | None = None,
        name: str | None = None,
    ) -> None:
        check_tf32_override()
        if max_eval is None:
            max_eval = max_iter * 5 // 4
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.max_eval = max_eval
        self.tolerance_grad = tolerance_grad
        self.tolerance_change = tolerance_change
        self.history_size = history_size
        self.line_search_fn = line_search_fn

        if isinstance(parameters, paddle.Tensor):
            raise TypeError(
                "parameters argument given to the optimizer should be "
                "an iterable of Tensors or dicts, but got "
                + str(type(parameters))
            )

        self.state = defaultdict(dict)

        super().__init__(
            learning_rate=1.0,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name,
        )

        if not isinstance(self._parameter_list[0], dict):
            self._params = self._parameter_list
        else:
            for idx, param_group in enumerate(self._param_groups):
                self._params = param_group['params']

        self._numel_cache = None

    def state_dict(self) -> _LbfgsStateDict:
        r"""Returns the state of the optimizer as a :class:`dict`.

        Return:
            state, a dict holding current optimization state. Its content
            differs between optimizer classes.

        Examples:
            .. code-block:: python

                >>> import paddle

                >>> paddle.disable_static()

                >>> net = paddle.nn.Linear(10, 10)
                >>> opt = paddle.optimizer.LBFGS(
                ...     learning_rate=1,
                ...     max_iter=1,
                ...     max_eval=None,
                ...     tolerance_grad=1e-07,
                ...     tolerance_change=1e-09,
                ...     history_size=100,
                ...     line_search_fn='strong_wolfe',
                ...     parameters=net.parameters(),
                >>> )

                >>> def train_step(inputs, targets):
                ...     def closure():
                ...         outputs = net(inputs)
                ...         loss = paddle.nn.functional.mse_loss(outputs, targets)
                ...         opt.clear_grad()
                ...         loss.backward()
                ...         return loss
                ...
                ...     opt.step(closure)
                ...
                >>> inputs = paddle.rand([10, 10], dtype="float32")
                >>> targets = paddle.to_tensor([2 * x for x in inputs])

                >>> n_iter = 0
                >>> while n_iter < 20:
                ...     loss = train_step(inputs, targets)
                ...     n_iter = opt.state_dict()["state"]["func_evals"]
                ...     print("n_iter:", n_iter)
        """
        packed_state = {}
        for k, v in self.state.items():
            packed_state.update({k: v})

        return {'state': packed_state}

    def _numel(self):
        # total number of scalar elements across all trainable parameters
        if self._numel_cache is None:
            self._numel_cache = reduce(
                lambda total, p: total + p.numel(), self._params, 0
            )
        return self._numel_cache

    def _gather_flat_grad(self):
        # concatenate all parameter gradients into a single 1-D tensor
        views = []
        for p in self._params:
            if p.grad is None:
                view = paddle.zeros_like(p).reshape([-1])
            else:
                view = p.grad.reshape([-1])
            views.append(view)
        return paddle.concat(views, axis=0)

    def _add_grad(self, alpha, direction):
        # in-place update: p <- p + alpha * (the slice of `direction` owned by p)
        offset = 0
        for p in self._params:
            numel = (
                reduce(lambda x, y: x * y, p.shape) if p.shape != [] else 1
            )
            paddle.assign(
                p.add(
                    direction[offset : offset + numel].reshape(p.shape) * alpha
                ),
                p,
            )
            offset += numel
        assert offset == self._numel()

    def _clone_param(self):
        return [p.clone() for p in self._params]

    def _set_param(self, params_data):
        for p, pdata in zip(self._params, params_data):
            paddle.assign(pdata, p)

    def _directional_evaluate(self, closure, x, alpha, d):
        # evaluate loss and flat gradient at x + alpha * d, then restore x
        self._add_grad(alpha, d)
        loss = float(closure())
        flat_grad = self._gather_flat_grad()
        self._set_param(x)
        return loss, flat_grad

    @framework.non_static_only
    def step(self, closure) -> Tensor:
        """Performs a single optimization step.

        Args:
            closure (callable): A closure that reevaluates the model
            and returns the loss.

        Examples:
            .. code-block:: python

                >>> import paddle

                >>> paddle.disable_static()

                >>> inputs = paddle.rand([10, 10], dtype="float32")
                >>> targets = paddle.to_tensor([2 * x for x in inputs])

                >>> net = paddle.nn.Linear(10, 10)
                >>> opt = paddle.optimizer.LBFGS(
                ...     learning_rate=1,
                ...     max_iter=1,
                ...     max_eval=None,
                ...     tolerance_grad=1e-07,
                ...     tolerance_change=1e-09,
                ...     history_size=100,
                ...     line_search_fn='strong_wolfe',
                ...     parameters=net.parameters(),
                >>> )

                >>> def closure():
                ...     outputs = net(inputs)
                ...     loss = paddle.nn.functional.mse_loss(outputs, targets)
                ...     print("loss:", loss.item())
                ...     opt.clear_grad()
                ...     loss.backward()
                ...     return loss
                ...
                >>> opt.step(closure)
        """
        with paddle.no_grad():
            # make sure the closure is always called with grad enabled
            closure = paddle.enable_grad()(closure)

            learning_rate = self.learning_rate
            max_iter = self.max_iter
            max_eval = self.max_eval
            tolerance_grad = self.tolerance_grad
            tolerance_change = self.tolerance_change
            line_search_fn = self.line_search_fn
            history_size = self.history_size
            state = self.state
            state.setdefault('func_evals', 0)
            state.setdefault('n_iter', 0)

            # evaluate initial f(x) and df/dx
            orig_loss = closure()
            loss = float(orig_loss)

            current_evals = 1
            state['func_evals'] += 1

            flat_grad = self._gather_flat_grad()
            opt_cond = flat_grad.abs().max() <= tolerance_grad

            # optimal condition
            if opt_cond:
                return orig_loss

            # tensors cached in state
            d = state.get('d')
            alpha = state.get('alpha')
            old_yk = state.get('old_yk')
            old_sk = state.get('old_sk')
            ro = state.get('ro')
            H_diag = state.get('H_diag')
            prev_flat_grad = state.get('prev_flat_grad')
            prev_loss = state.get('prev_loss')

            n_iter = 0
            # optimize for at most max_iter iterations
            while n_iter < max_iter:
                n_iter += 1
                state['n_iter'] += 1

                # compute the search direction
                if state['n_iter'] == 1:
                    # first iteration takes the steepest-descent direction
                    d = flat_grad.neg()
                    old_yk = []
                    old_sk = []
                    ro = []
                    H_diag = paddle.to_tensor(1.0, dtype=orig_loss.dtype)
                else:
                    # do L-BFGS update (update memory)
                    y = flat_grad.subtract(prev_flat_grad)
                    s = d.multiply(paddle.to_tensor(alpha, dtype=d.dtype))
                    ys = dot(y, s)
                    if ys > 1e-10:
                        # updating memory
                        if len(old_yk) == history_size:
                            # shift history by one (limited-memory)
                            old_yk.pop(0)
                            old_sk.pop(0)
                            ro.pop(0)

                        # store new curvature pair
                        old_yk.append(y)
                        old_sk.append(s)
                        ro.append(1.0 / ys)

                        # update scale of initial Hessian approximation
                        H_diag = ys / dot(y, y)

                    # compute the approximate (L-BFGS) inverse Hessian
                    # multiplied by the gradient
                    num_old = len(old_yk)

                    if 'al' not in state:
                        state['al'] = [None] * history_size
                    al = state['al']

                    # two-loop recursion collapsed to use just one buffer
                    q = flat_grad.neg()
                    for i in range(num_old - 1, -1, -1):
                        al[i] = dot(old_sk[i], q) * ro[i]
                        paddle.assign(q.add(old_yk[i] * (-al[i])), q)

                    # multiply by initial Hessian; r/d is the final direction
                    d = r = paddle.multiply(q, H_diag)
                    for i in range(num_old):
                        be_i = dot(old_yk[i], r) * ro[i]
                        paddle.assign(r.add(old_sk[i] * (al[i] - be_i)), r)

                if prev_flat_grad is None:
                    prev_flat_grad = flat_grad.clone()
                else:
                    paddle.assign(flat_grad, prev_flat_grad)
                prev_loss = loss

                # compute the step length: reset initial guess on first iter
                if state['n_iter'] == 1:
                    alpha = (
                        min(1.0, 1.0 / flat_grad.abs().sum()) * learning_rate
                    )
                else:
                    alpha = learning_rate

                # directional derivative
                gtd = dot(flat_grad, d)

                # directional derivative is below tolerance
                if gtd > -tolerance_change:
                    break

                # optional line search: user function
                ls_func_evals = 0
                if line_search_fn is not None:
                    # perform line search, using user function
                    if line_search_fn != 'strong_wolfe':
                        raise RuntimeError("only 'strong_wolfe' is supported")
                    else:
                        x_init = self._clone_param()

                        def obj_func(x, alpha, d):
                            return self._directional_evaluate(
                                closure, x, alpha, d
                            )

                        loss, flat_grad, alpha, ls_func_evals = _strong_wolfe(
                            obj_func, x_init, alpha, d, loss, flat_grad, gtd
                        )
                        self._add_grad(alpha, d)
                        opt_cond = flat_grad.abs().max() <= tolerance_grad
                else:
                    # no line search, simply move with fixed step
                    self._add_grad(alpha, d)
                    if n_iter != max_iter:
                        # re-evaluate function only if not in last iteration;
                        # in a stochastic setting there is no use
                        # re-evaluating the function here
                        with paddle.enable_grad():
                            loss = float(closure())
                        flat_grad = self._gather_flat_grad()
                        opt_cond = flat_grad.abs().max() <= tolerance_grad
                        ls_func_evals = 1

                # update func eval counters
                current_evals += ls_func_evals
                state['func_evals'] += ls_func_evals

                # check termination conditions
                if opt_cond:
                    break

                # lack of progress
                if (
                    d.multiply(paddle.to_tensor(alpha, dtype=d.dtype))
                    .abs()
                    .max()
                    <= tolerance_change
                ):
                    break

                if abs(loss - prev_loss) < tolerance_change:
                    break

                if current_evals >= max_eval:
                    break

                if n_iter == max_iter:
                    break

            state['d'] = d
            state['alpha'] = alpha
            state['old_yk'] = old_yk
            state['old_sk'] = old_sk
            state['ro'] = ro
            state['H_diag'] = H_diag
            state['prev_flat_grad'] = prev_flat_grad
            state['prev_loss'] = prev_loss

        return orig_loss

    def minimize(
        self,
        loss: Tensor,
        startup_program=None,
        parameters=None,
        no_grad_set=None,
    ) -> NoReturn:
        """Empty method. LBFGS optimizer does not use this way to minimize ``loss``. Please refer 'Examples' of LBFGS() above for usage."""
        raise NotImplementedError(
            "LBFGS optimizer does not use this way to minimize loss. "
            "Please refer 'Examples' of LBFGS() for usage."
        )
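

# ---------------------------------------------------------------------------
# Minimal usage sketch (an editorial illustration, not part of the upstream
# module): it drives ``LBFGS.step`` with a closure on a tiny least-squares
# problem, mirroring the pattern from the class docstring above. The names
# ``w``, ``xs`` and ``ys`` are arbitrary.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    w = paddle.create_parameter(shape=[1], dtype='float32')
    xs = paddle.rand([10, 1])
    ys = 3.0 * xs

    opt = LBFGS(
        learning_rate=1.0,
        max_iter=10,
        line_search_fn='strong_wolfe',
        parameters=[w],
    )

    def closure():
        loss = paddle.nn.functional.mse_loss(w * xs, ys)
        opt.clear_grad()
        loss.backward()
        return loss

    for _ in range(5):
        final_loss = opt.step(closure)
    print("final loss:", float(final_loss))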