
    zjC                       d dl mZ d dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZ dd	lmZ er-d d
lmZmZ d dlmZ d dlmZ d dlmZ d dlmZ  e
dd          Z eddd           G d de                      ZdS )    )annotations)defaultdict)reduce)TYPE_CHECKINGAnyLiteralTypeVarN)	Optimizer)
deprecated   )_strong_wolfe)CallableSequence)Tensor)GradientClipBase)_ParameterConfig)WeightDecayRegularizer_T_coT)	covariantz2.5.0zpaddle.optimizer.LBFGS)since	update_tolevelc                       e Zd ZU dZded<   ded<   ded<   ded<   ded<   ded	<   d
ed<   ded<   	 	 	 	 	 	 	 	 	 	 	 d+d, fdZd-d Zd! Zd" Zd# Z	d$ Z
d% Zd& Zd.d*Z xZS )/LBFGSa.  
    The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
    Closely related is the Newton method for minimization. Consider the iterate update formula:

    .. math::
        x_{k+1} = x_{k} + H_k \nabla{f_k}

    If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
    If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then
    it's a quasi-Newton. In practice, the approximated Hessians are obtained
    by only using the gradients, over either whole or part of the search
    history, the former is BFGS, the latter is L-BFGS.

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).

    Args:
        learning_rate (float, optional): learning rate .The default value is 1.
        max_iter (int, optional): maximal number of iterations per optimization step.
            The default value is 20.
        max_eval (int, optional): maximal number of function evaluations per optimization
            step. The default value is max_iter * 1.25.
        tolerance_grad (float, optional): termination tolerance on first order optimality
            The default value is 1e-5.
        tolerance_change (float, optional): termination tolerance on function
            value/parameter changes. The default value is 1e-9.
        history_size (int, optional): update history size. The default value is 100.
        line_search_fn (string, optional): either 'strong_wolfe' or None. The default value is strong_wolfe.
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. The default value is None.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It canbe a float value as coeff of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of \
            some derived class of ``GradientClipBase`` . There are three clipping strategies \
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    Return:
        loss (Tensor): the final loss of closure.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import numpy as np
            >>> from paddle.incubate.optimizer import LBFGS

            >>> paddle.disable_static()
            >>> np.random.seed(0)
            >>> np_w = np.random.rand(1).astype(np.float32)
            >>> np_x = np.random.rand(1).astype(np.float32)

            >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)]
            >>> # y = 2x
            >>> targets = [2 * x for x in inputs]

            >>> class Net(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...         w = paddle.to_tensor(np_w)
            ...         self.w = paddle.create_parameter(shape=w.shape, dtype=w.dtype, default_initializer=paddle.nn.initializer.Assign(w))
            ...     def forward(self, x):
            ...         return self.w * x

            >>> net = Net()
            >>> opt = LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
            >>> def train_step(inputs, targets):
            ...     def closure():
            ...         outputs = net(inputs)
            ...         loss = paddle.nn.functional.mse_loss(outputs, targets)
            ...         print('loss: ', loss.item())
            ...         opt.clear_grad()
            ...         loss.backward()
            ...         return loss
            ...     opt.step(closure)

            >>> for input, target in zip(inputs, targets):
            ...     input_tensor = paddle.to_tensor(input)
            ...     target_tensor = paddle.to_tensor(target)
            ...     train_step(input_tensor, target_tensor)

    floatlearning_rateintmax_itermax_evaltolerance_gradtolerance_changehistory_sizeLiteral['strong_wolfe'] | Noneline_search_fndict[str, dict[str, Any]]state      ?   NHz>&.>d   
int | None
parameters4Sequence[Tensor] | Sequence[_ParameterConfig] | Noneweight_decay%float | WeightDecayRegularizer | None	grad_clipGradientClipBase | Nonename
str | Nonereturnr   c                $   ||dz  dz  }|| _         || _        || _        || _        || _        || _        || _        t          |t          j	                  rt          dt          |          z             t          t                    | _        t                                          d||	|
|           t          | j        d         t                    s| j        | _        n't'          | j                  D ]\  }}|d         | _        d | _        d S )N      z^parameters argument given to the optimizer should be an iterable of Tensors or dicts, but got r'   )r   r-   r/   r1   r3   r   params)r   r   r   r    r!   r"   r$   
isinstancepaddler   	TypeErrortyper   dictr&   super__init___parameter_list_params	enumerate_param_groups_numel_cache)selfr   r   r   r    r!   r"   r$   r-   r/   r1   r3   idxparam_group	__class__s                 o/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/incubate/optimizer/lbfgs.pyr@   zLBFGS.__init__   s/    !|q(H*  , 0(,j&-00 	<>B:>N>NO  
 !&&
!% 	 	
 	
 	
 $.q1488 	5/DLL$-d.@$A$A 5 5 [*84     c                z    i }| j                                         D ]\  }}|                    ||i           d|iS )zReturns the state of the optimizer as a :class:`dict`.

        Return:
            state, a dict holding current optimization state. Its content
                differs between optimizer classes.
        r&   )r&   itemsupdate)rF   packed_statekvs       rJ   
state_dictzLBFGS.state_dict   sP     J$$&& 	( 	(DAqA''''&&rK   c                V    | j         t          d | j        d          | _         | j         S )Nc                0    | |                                 z   S N)numel)totalps     rJ   <lambda>zLBFGS._numel.<locals>.<lambda>   s    !2 rK   r   )rE   r   rB   rF   s    rJ   _numelzLBFGS._numel   s4    $ &22DL!! !D   rK   c                   g }| j         D ]b}|j        )t          j        |                              dg          }n|j                            dg          }|                    |           ct          j        |d          S )Nr   )axis)rB   gradr;   
zeros_likereshapeappendconcat)rF   viewsrX   views       rJ   _gather_flat_gradzLBFGS._gather_flat_grad   s     	 	Av~(++33RD99v~~rd++LL}U++++rK   c           	     $   d}| j         D ]k}t          d |j                  }t          j        |                    ||||z                                |j                  |z            |          }||z  }l||                                 k    sJ d S )Nr   c                    | |z  S rU    )xys     rJ   rY   z!LBFGS._add_grad.<locals>.<lambda>   s
    A rK   )rB   r   shaper;   assignaddra   r[   )rF   alpha	directionoffsetrX   rV   s         rJ   	_add_gradzLBFGS._add_grad   s     	 	A--qw77Efv~56>>qwGG%O  	 A eOFF&&&&&&rK   c                $    d | j         D             S )Nc                6    g | ]}|                                 S ri   )clone).0rX   s     rJ   
<listcomp>z&LBFGS._clone_param.<locals>.<listcomp>   s     000a		000rK   )rB   rZ   s    rJ   _clone_paramzLBFGS._clone_param   s    004<0000rK   c                f    t          | j        |          D ]\  }}t          j        ||           d S rU   )ziprB   r;   rm   )rF   params_datarX   pdatas       rJ   
_set_paramzLBFGS._set_param   s@    DL+66 	$ 	$HAuM%####	$ 	$rK   c                    |                      ||           t           |                      }|                                 }|                     |           ||fS rU   )rr   r   rf   r}   )rF   closurerj   ro   dloss	flat_grads          rJ   _directional_evaluatezLBFGS._directional_evaluate   sV    ua   WWYY**,,	YrK   r   Callable[[], _T_co]r   c           
         t          j                    5   t          j                               j        } j        } j        } j        } j        } j        } j	        } j
        }	|	                    dd           |	                    dd                        }
t          |
          }d}|	dxx         dz  cc<                                    }|                                                                |k    }|r|
cddd           S |	                    d          }|	                    d          }|	                    d          }|	                    d	          }|	                    d
          }|	                    d          }|	                    d          }|	                    d          }d}||k     r|dz  }|	dxx         dz  cc<   |	d         dk    r7|                                }g }g }g }t          j        d|
j                  }n?|                    |          }|                    t          j        ||j                            }|                    |          }|dk    rt/          |          |k    r?|                    d           |                    d           |                    d           |                    |           |                    |           |                    d|z             ||                    |          z  }t/          |          }d|	vr	dg|z  |	d<   |	d         }|                                }t5          |dz
  dd          D ]a}||                             |          ||         z  ||<   t          j        |                    ||         ||          z            |           bt          j        ||          x}}t5          |          D ]`}||                             |          ||         z  } t          j        |                    ||         ||         | z
  z            |           a||                                }nt          j        ||           |}|	d         dk    r;t=          dd|                                                                z            |z  }n|}|                    |          }!|!| k    rnd}"||dk    rtA          d           !                                }# fd}$tE          |$|#|||||!          \  }}}}" #                    ||           |                                                                |k    }n #                    ||           ||k    rt          j                    5  t                                 }ddd           n# 1 swxY w Y                                    }|                                                                |k    }d}"||"z  }|	dxx         |"z  cc<   |rnZ||z                                                                  |k    rn,t          ||z
            |k     rn||k    rn||k    rn||k     ||	d<   ||	d<   ||	d<   ||	d	<   ||	d
<   ||	d<   ||	d<   ||	d<   ddd           n# 1 swxY w Y   |
S )z
        Performs a single optimization step.

        Args:
            closure (callable): A closure that reevaluates the model
                and returns the loss.

        
func_evalsr   n_iterr   Nr   ro   old_ykold_skroH_diagprev_flat_grad	prev_lossr'   )dtypeg|=alr]   strong_wolfez only 'strong_wolfe' is supportedc                4                         | ||          S rU   )r   )rj   ro   r   r   rF   s      rJ   obj_funczLBFGS.step.<locals>.obj_func  s$    #'#=#= 'E1$ $ rK   )$r;   no_gradenable_gradr   r   r   r    r!   r$   r"   r&   
setdefaultr   rf   absmaxgetneg	to_tensorr   subtractmultiplydotlenpoprb   rangerm   rn   ru   minsumRuntimeErrorrx   r   rr   )%rF   r   r   r   r   r    r!   r$   r"   r&   	orig_lossr   current_evalsr   opt_condr   ro   r   r   r   r   r   r   r   rk   sysnum_oldr   qirbe_igtdls_func_evalsx_initr   s%   ``                                   rJ   stepz
LBFGS.step   s    ^ s	+ s	+*f(**733G .M}H}H!0N#4!0N,LJE\1---Xq)))  		I##DM,1$..00I }}**,,>H  ! 7s	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+< 		#AIIg&&EYYx((FYYx((F4BYYx((F"YY'788N		+..IF8##!h1$
 ?a''!AFFB#-cIIIFF "**>::A

6#3E#I#I#IJJAqBEzzv;;,66"JJqMMM"JJqMMMFF1III a(((a(((		#(+++ "$aeeAhh "&kkG5(('+f|&;dtB "A"7Q;B77 F F &q	a 0 02a5 81aeeF1I"Q%,@&A&A1EEEE #OAv666A"7^^ L L%ay}}Q//"Q%7aeeF1IA,F&G&GKKKK!)%.__%6%6NNM)^<<< 	 ?a''Cy}}':':'<'<!<==M E *E  mmA&& **** !"!-%77*+MNNN!%!2!2!4!4     
 AN$feQiA A=i NN5!,,,(}}2244FHH NN5!,,,))#/11 4 4#(#3#3D4 4 4 4 4 4 4 4 4 4 4 4 4 4 4$($:$:$<$<	#,==??#6#6#8#8N#J() .l###}4###   I??$$((**.>>>ti'((+;;; !H,,X%%C 8##F E#J"E'N$E(O$E(OE$K$E(O&4E"#!*E+gs	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+ s	+j s>   C-Z?RZ?*WZ?W	Z?W	CZ??[[)r'   r(   Nr)   r*   r+   NNNNN)r   r   r   r   r   r,   r    r   r!   r   r"   r   r$   r#   r-   r.   r/   r0   r1   r2   r3   r4   r5   r   )r5   r%   )r   r   r5   r   )__name__
__module____qualname____doc____annotations__r@   rR   r[   rf   rr   rx   r}   r   r   __classcell__)rI   s   @rJ   r   r   %   s[        Y Yv MMMMMM2222$$$$  ## $"&9=KO>B-1/! /! /! /! /! /! /!b' ' ' '! ! !, , ,' ' '1 1 1$ $ $         rK   r   )
__future__r   collectionsr   	functoolsr   typingr   r   r   r	   r;   paddle.optimizerr
   paddle.utilsr   line_search_dygraphr   collections.abcr   r   r   paddle.nn.clipr   paddle.optimizer.optimizerr   paddle.regularizerr   r   r   ri   rK   rJ   <module>r      sz   # " " " " " # # # # # #       7 7 7 7 7 7 7 7 7 7 7 7  & & & & & & # # # # # # . . . . . . -22222222//////;;;;;;999999GGt,,,E '%=QGGGR R R R RI R R HGR R RrK   