
    zj`                        U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl	m
Z
 d dlmZ d dlmZ d dlmZ erd dlmZ ej        j        d	ej        j        d
ej        j        d
ej        j        dej        j        dej        j        d	ej        j        d	ej        j        d	ej        j        d
ej        j        dej        j        dej        j        d	ej        j         dej        j!        diZ"g dZ#de$d<   ddgZ%de$d<   g Z&de$d<   g dZ'de$d<   g dZ(de$d<   dgZ)de$d<   dZ*dZ+dZ,d Z- G d d           Z. G d! d"          Z/	 dCdDd0Z0dEd4Z1d5 Z2d6 Z3i di fd7Z4d8 Z5d9 Z6d: Z7d; Z8d< Z9d= Z:d> Z;d? Z<d@ Z=dA Z>dB Z?dS )F    )annotationsN)TYPE_CHECKING)pir)backward_utils)core)in_cinn_debug_mode)Sequence               )Mpd_op.full_int_array
pd_op.fullzpd_op.dividezpd_op.subtractz	pd_op.addzpd_op.multiplyzpd_op.elementwise_powzpd_op.rsqrtzpd_op.reshapezpd_op.full_likezpd_op.assignzpd_op.expandzpd_op.scalez	pd_op.expz	pd_op.sinz	pd_op.coszpd_op.add_nz
pd_op.castzpd_op.concatzpd_op.full_with_tensorzpd_op.gather_ndzpd_op.logical_andzpd_op.logical_notzpd_op.wherez	pd_op.powzpd_op.shapezpd_op.shape64zpd_op.slicezpd_op.squeezezpd_op.unsqueezezpd_op.transposez	pd_op.logzpd_op.log1pzpd_op.logitzpd_op.expand_aszpd_op.splitzpd_op.arangezpd_op.put_along_axisz
pd_op.tanhz
pd_op.atanzpd_op.atanhz
pd_op.sinhz
pd_op.asinzpd_op.asinhz
pd_op.coshz
pd_op.acoszpd_op.acoshz	pd_op.absz
pd_op.signzpd_op.expm1z	pd_op.erfzpd_op.erfinvz
pd_op.ceilzpd_op.floorz
pd_op.fraczpd_op.roundzpd_op.trunczpd_op.anglezpd_op.as_complexzpd_op.as_realzpd_op.complexz
pd_op.realz
pd_op.imagz
pd_op.conjzpd_op.greater_equalzpd_op.greater_thanzpd_op.not_equalzpd_op.equalzpd_op.less_equalzpd_op.less_thanzpd_op.bitwise_andzpd_op.bitwise_orzpd_op.bitwise_xorzpd_op.bitwise_notzpd_op.isinfzpd_op.isnanzpd_op.sigmoidz	list[str]DEFAULT_RECOMPUTABLE_OPSr   r   TENDING_TO_RECOMPUTE_OPSVIEW_OPS)zpd_op.randintzpd_op.uniformzpd_op.dropout
RANDOM_OPS)zpd_op.matmulzpd_op.conv2dzpd_op.layer_normzpd_op.batchnormzpd_op.softmaxzpd_op.all_reduce_zpd_op.c_broadcast_zpd_op.reduce_COMPUTE_INTENSIVE_OPSzcf.stack_create
IGNORE_OPSF   g?c                     t          j        d          }|r1t          |                                          dv rt	          | ddi d S d S d S )N FLAGS_print_auto_recompute_debug)1trueflushT)osgetenvstrlowerprint)argsflags     n/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/decomposition/recompute.py
DebugPrintr%      s]    9788D !D		!!]22t 4     ! !22    c                       e Zd Zd Zd Zd ZdS )JudgeFusionLoopc                    |                                 j        | _        || _        d | j        D             | _        d | j        D             | _        |                                  d S )Nc                ,    i | ]}|t                      S  set.0ops     r$   
<dictcomp>z,JudgeFusionLoop.__init__.<locals>.<dictcomp>   s    1O1O1O"cee1O1O1Or&   c                ,    i | ]}|t                      S r+   r,   r.   s     r$   r1   z,JudgeFusionLoop.__init__.<locals>.<dictcomp>   s    /M/M/MbCEE/M/M/Mr&   )global_blockopsunrecomputable_ops!downstream_unrecomputable_ops_mapupstream_unrecomputable_ops_map_set_has_unfusible_on_path_map)selfprogramr5   s      r$   __init__zJudgeFusionLoop.__init__   sh    ''))-"41O1Odh1O1O1O./M/MDH/M/M/M,++-----r&   c                    fdfdfdd  fd} fd} j         D ] } j        |xx          ||          z  cc<   !t           j                   D ] } j        |xx          ||          z  cc<   !d S )Nc                B    t                      }g } |||            |S Nr,   )r0   defined_valuesused_values_get_used_external_value_impls      r$   _get_used_external_valuezPJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_used_external_value   s,     UUNK)).+rJJJr&   c                *   |                                 D ]0}|| vr*|                    |           |                     |           1|                                D ]]}|                                D ]}|                     |           |                                D ]\  }}|                     |           ^|                                D ]}|j        D ]} 	| ||           |                                D ]}|                     |           d S r>   )operands_sourceappendaddblocksr"   kwargsr4   results)
r?   r@   r0   operandblockvalue_inner_opresult_valuerA   s
            r$   rA   zUJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_used_external_value_impl   sV   --// 0 0.00&&w///"&&w/// . ."ZZ\\ . .E"&&u---- % . .HAu"&&u----.   %	  H11&X    !#

 1 1""<00001 1r&   c                   t                      } |           D ]j}|                                |                                }|                                |                                 k    r|                    |           k|S r>   )r-   get_defining_opget_parent_blockrF   )r0   	producersrJ   	source_oprB   s       r$   _get_producer_opszIJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_producer_ops   s    I33B77 - -**,,4#3355	--//23F3F3H3HHHMM),,,r&   c                    t                      }|                                 D ]0}|                                D ]}||                    |           1|S r>   )r-   rI   all_used_ops_in_same_blockrF   )r0   	consumersresult	parent_ops       r$   _get_consumer_opszIJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_consumer_ops   sc    I**,, 1 1!'!B!B!D!D 1 1I ,!i0001 r&   c                    t                      } |           D ]}|j        |         z  }|                                 j        v r|                    |            |S r>   )r-   r7   namer5   rF   )curupstream_unrecomputable_opsnew_oprU   r9   s      r$   _get_upstream_ops_recursivelyzUJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_upstream_ops_recursively   sr    *-%%'++C00  +8@++ xxzzT444+//444..r&   c                    t                      } |           D ]}|j        |         z  }|                                 j        v r|                    |            |S r>   )r-   r6   r]   r5   rF   )r^   downstream_unrecomputable_opsr`   r[   r9   s      r$   _get_downstream_ops_recursivelyzWJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_downstream_ops_recursively   sr    ,/EE)++C00  -:6B-- xxzzT444-11#66600r&   )r4   r7   reversedr6   )r9   ra   rd   r0   r[   rU   rB   rA   s   `   @@@@r$   r8   z.JudgeFusionLoop._set_has_unfusible_on_path_map   s5   	 	 	 	 		1 	1 	1 	1 	1$	 	 	 	 		 	 		/ 	/ 	/ 	/ 	/ 	/	1 	1 	1 	1 	1 	1 ( 	 	B0444--b114444 48$$ 	 	B22666//336666	 	r&   c                    t          | j        |         | j        |         z            dk    o+t          | j        |         | j        |         z            dk    }||| ndS )Nr   F)lenr6   r7   )r9   op1op2no_unfusible_op_on_paths       r$   _has_unfusible_op_on_any_pathz-JudgeFusionLoop._has_unfusible_op_on_any_path   s    6s;6s;<  	 	
 6s;6s;<  	 	  3? (''	
r&   N)__name__
__module____qualname__r;   r8   rk   r+   r&   r$   r(   r(      sE        . . .F F FP
 
 
 
 
r&   r(   c                      e Zd Zd Zd ZdS )	Op2IdxMapc                    i | _         t          |                                j                  D ]\  }}|| j         |<   d S r>   )op_to_idx_map	enumerater3   r4   )r9   r:   idxop_iters       r$   r;   zOp2IdxMap.__init__  sN    %g&:&:&<&<&@AA 	. 	.LC*-Dw''	. 	.r&   c                p    | j                             |d           r| j         |         S t          d          )Nzop not found in program)rr   getRuntimeError)r9   r0   s     r$   get_idxzOp2IdxMap.get_idx  s:    !!"d++ 	*%b))4555r&   N)rl   rm   rn   r;   ry   r+   r&   r$   rp   rp     s2        . . .
6 6 6 6 6r&   rp   r:   paddle.static.PrograminputsSequence[pir.Value]outputsgrad_outputsfwd_op_end_idxintbackward_op_start_idxrecomputable_opsSequence[str] | Nonereturn!tuple[paddle.static.Program, int]c           	       &'()*+,-./0123 t          d|            ddl}t          j                    }t          | ||          \  /}	}
t	          |	          dk    s*|t	          |                                 j                  k    r| |fS |                                 j        }t          |/          *t          }t          3|3z  }t                    nt          |          t          }t          }t          0||z   2t          |          z  ,t          j        |          }t          j        |          }||z  }i }|                                .t#          | 2          }t          |                                 j        d|dz                      +d } || |          ),fd&&)+/3fd(&)+fd''*0fd	}(*02fd
}/|	z  |
z  D ]-}|                                s|                                                                dk    rC|                                                                t*          v rqt	          |                                          dk    r/|                                d                                         dv r||	v rNt          d|j        ddd           .                    |j        dz   dt2          j                   |||j        <   ||v rLt          ddd|j        d           .                    d|j        dz   t2          j                   |||j        <    ||          rP|/v rLt          ddd|j        d           .                    d|j        dz   t2          j                   |||j        <    |||          }.                    |j        dz   |j        dz   |           |||j        <   t7          |)d+          }|D ]Q}t          d|j        d|j        d           .                    |j        dz   |j        dz   t2          j                   R|                                D ]}|+v r|                    |                                |          rt          ddd|j        d           .                    d|j        dz   t2          j                   t          d|j        ddd           .                    |j        dz   dt2          j                   /|                    .dd          \  }}t          d|           |\  }-t                      }.fd|D             D ]'\  1}|                    -1fd|D                        (t          j                    }|D ]B\  }} |dd         | dd         k    sJ ||dd                  }|                    |           C|}!|}!tA          | |!||)||          \  }"}#t          d|"           t          j                    }$tC                      rOtE          j#        d           }%|%$                    tD          j%                   |%&                    d!|$|z
   d"           |"|#fS )#a  
    Considering the compiler fuse strategy, we model the pir graph.
    Convert the pir calculation graph into a networkx calculation
    graph. Find the cut point through the min-cut algorithm,
    which is the value to be saved in pir forward calculation graph.

    Recompute the forward computation graph to replace intermediate
    variables in the forward graph held by the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        program (Program): The program to be recomputed.
        inputs:(list[Value]|tuple(Value)): The input Values
            of the forward graph.
        outputs:(list[Value]|tuple(Value)): The out Values
            of the forward graph.
        grad_outputs:(list[Value]|tuple(Value)): initial gradient values
            of `outputs` .
        forward_op_end_idx(int): The index of the last forward op.
        backward_op_start_idx(int): The index of the start backward op.
        recomputable_ops(list[str]|tuple(str)|None): The op names that can
            be recomputed. If 'recompute_ops' is None, we will use the
            default recomputable_ops. Default None.
    Returns:
        recomputed_program(Program): The recomputed program.
        fwd_op_end_idx(int): The index of the last forward op in recomputed program.

    Examples:
        .. code-block:: python

        >>> import numpy as np
        >>> import paddle
        >>> from paddle.autograd.ir_backward import grad as ir_grad
        >>> from paddle.base import core
        >>> from paddle.decomposition import decompose
        >>> def forward(x):
        ...     y = paddle.sin(x)
        ...     z = paddle.cos(y)
        ...     return z

        >>> np_x = np.random.random(size=[4096, 4096]).astype("float32")
        >>> paddle.enable_static()
        >>> core._set_prim_all_enabled(True)
        >>> main_program = paddle.static.Program()
        >>> with paddle.static.program_guard(main_program):
        >>>     x = paddle.static.data(
        >>>         name="x", shape=[4096, 4096], dtype="float32"
        >>>     )
        >>>     x.stop_gradient = False
        >>>     out = forward(x)
        >>>     out_grad = paddle.full(
        >>>         shape=out.shape, fill_value=3, dtype="float32"
        >>>     )
        >>>     [out] = decompose(main_program, [out])
        >>>     [dx] = ir_grad(out, [x], out_grad)
        >>>     main_program, _ = paddle.decomposition.auto_recompute(
        >>>         main_program,
        >>>         [x],
        >>>         [out],
        >>>         grad_outputs=[out_grad],
        >>>         fwd_op_end_idx=2,
        >>>         backward_op_start_idx=4
        >>>     )
        >>>     exe = paddle.static.Executor(paddle.CUDAPlace(0))
        >>>     res = exe.run(
        >>>         feed={'x': np_x},
        >>>         fetch_list=[dx],
        >>>     )
        >>>     print(main_program)
        {
            (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"x",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[false]} : () -> pd_op.tensor<4096x4096xf32>
            (%1) = "pd_op.sin" (%0) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%2) = "pd_op.cos" (%1) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%3) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true],value:(Float)3} : () -> pd_op.tensor<4096x4096xf32>
            (%4) = "pd_op.sin" (%0) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%5) = "pd_op.sin" (%4) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%6) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xf32>
            (%7) = "pd_op.scale" (%5, %6) {bias:(Float)0,bias_after_scale:true,stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>, pd_op.tensor<1xf32>) -> pd_op.tensor<4096x4096xf32>
            (%8) = "pd_op.multiply" (%7, %3) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>, pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%9) = "pd_op.cos" (%0) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%10) = "pd_op.multiply" (%9, %8) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>, pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%11) = "pd_op.fetch" (%10) {col:(Int32)0,is_persistable:[true],name:"fetch0",stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
        }
    zprogram before recompute:r   Nr
   c                R   t          j                    }t          j                    }|                                 j        |d          D ]Y}|                                D ]B}|                    |           |                    |          r-|                    |           CZ||z
  }|S r>   )r   ValueSetr3   r4   rD   rF   is_no_need_buffer)r:   r   need_buffer_values
all_valuesr0   op_operand_sourcebw_no_need_buffer_valuess          r$   _get_bw_no_need_buffer_valuesz5auto_recompute.<locals>._get_bw_no_need_buffer_values  s    +466#,..
&&((,-B-C-CD 	: 	:B%'%7%7%9%9 : :!0111''(9:: "&&'89999	:
 $.0B#B ''r&   c                    |                                                                  v o'|                                                                 v S r>   )rQ   r]   )value_node1value_node2fusible_opss     r$   _is_fusiblez#auto_recompute.<locals>._is_fusible  sN    ''))..00K? D++--2244C	
r&   c                t   t          j                    }|                    |            t          |          dk    r{|                                }t          |d          }|D ]?}|vr ||          s dS |vr&t          |          	v r|                    |           @t          |          dk    {dS )Nr   TF)r   r   rF   rg   popfind_value_node_usersget_real_define_op_name)

value_nodecur_value_nodescur_value_nodeusersuserr   r   forward_opsrequired_fw_value_nodesview_opss
        r$   _is_materialized_backwardsz2auto_recompute.<locals>._is_materialized_backwards  s    (133J'''/""Q&&,0022N) 8$ E  	. 	.666{{"D@ @6  44 777/55AA#''--- /""Q&& ur&   c                p      |v rdS t           d          }t           fd|D                        S )NTc              3  0   K   | ]} |          V  d S r>   r+   )r/   r   r   r   s     r$   	<genexpr>z;auto_recompute.<locals>._is_materialized.<locals>.<genexpr>  s/      GG{{:t44GGGGGGr&   )r   all)r   placeholder_value_nodesr   r   r   r   s   `  r$   _is_materializedz(auto_recompute.<locals>._is_materialized  sY    0004%0$
 
 GGGGGGGGGGGGr&   c           
         t          |           }|                                                                 v r|dk    rt          S t	          |dt          t          |          d          d          z  z            } | |          r|S |dz  S )Nr   g?d   r
   r   )cal_value_node_sizerQ   r]   MINIMUM_WEIGHTr   maxmin)r   r   mem_szr   dist_from_bwtending_to_recompute_opss      r$   _get_node_weightz(auto_recompute.<locals>._get_node_weight  s    $Z00 &&((--//3KKK!!! cS\*%=s!C!CQGGGH
 
 J(?@@ 	MA:r&   c                   t           r(|                                                                 v S |                                                                 v rdS |                                                                 vrdS  |           rdS |          t          k    rdS t	          |           }t          |           }t          d |D                       }|dz  |k     S )NFTc              3  4   K   | ]}t          |          V  d S r>   )r   r/   is     r$   r   z=auto_recompute.<locals>._ban_recomputation.<locals>.<genexpr>  s+      EE1!44EEEEEEr&   r   )AGGRESSIVE_RECOMPUTATIONrQ   r]   MAX_DIST_FROM_BWr   get_real_input_nodessum)	r   output_sizer{   inputs_sizer   r   r   r   r5   s	       r$   _ban_recomputationz*auto_recompute.<locals>._ban_recomputation  s    # 	1--//4466:LLL))++00226NNNu))++0022:JJJt *)*55 tJ'*:::t .j99K)*55FEEfEEEEEK?[00r&   builtin.combinezbuiltin.splitzbuiltin.slicezadd edge link from: z -> sinkz (inf) _in)capacityz source z (inf)sourcez(inf)_outTz sink z
Cut Value:c              3  ,   K   | ]}||         fV  d S r>   r+   )r/   nnx_graphs     r$   r   z!auto_recompute.<locals>.<genexpr>  s,      88Q$888888r&   c              3  (   K   | ]}|v |fV  d S r>   r+   )r/   vnon_reachableus     r$   r   z!auto_recompute.<locals>.<genexpr>  s1      AAa=.@.@q!f.@.@.@.@AAr&   zprogram after recompute:zauto-recomputez(Time of auto recompute program: ***** [ z ] ***** seconds.)'r%   networkxtimeclassify_value_noderg   r3   r4    cal_value_nodes_dist_to_backwardr   r   r-   r   r   r   r   r   DiGraphr(   initializedrQ   r]   r   rW   idadd_edgemathinfr   rk   minimum_cutupdaterF   partition_joint_graphr   logging	getLoggersetLevelINFOinfo)4r:   r{   r}   r~   r   r   r   nx
start_timerequired_bw_value_nodesunclaimed_value_nodesall_opsdefault_recomputable_ops
random_opscompute_intensive_opsr   value_id_dictjudge_fusion_loopr   r   r   r   weightr   r   	cut_value	partition	reachablecutsetnbrscut_value_nodesvalue_node_invalue_node_outsaved_valuesprogram_after_recomputefwd_op_end_idx_after_recomputeend_timeloggerr   r   r   r   r   r   r   r   r   r   r   r   r5   r   s4         `                               @@@@@@@@@@@@@@r$   auto_recomputer     s   ~ *G444 J 	G\>BB	 "##q((,AS"F F - - &&""$$(G3( L
  8H( ' 	)**  J17#&;;"S__4K %g..G$V,,F$w.Mzz||H'1CDDg**,,01E>A3E1EFGGK
( 
( 
(  =<&   
 
 
 
 
        (H H H H H H H      &1 1 1 1 1 1 1 1 18 	 
!	"
	 q q

 %%'' 	%%'',,..2CCC%%'',,..*<<1133
 
 88::1=BBDD I
 
 
 000&
vvy   jme3VdhOOO+5M*-(&   *-%/$(     ,6M*-( z**	6555&   *-%/$(     ,6M*-(!!#
 
 	ME!:=6#9F 	 	
 	
 	
 (2jm$%0$
 
  
	 
	D&   &%$(      99;; 	 	D{""$BB..00$   .""   %% *-%"7$( &    ."    %%". &   1	: >>(HfEEIy|Y'''(I}UUF8888i888 B B4AAAAAdAAAAAAA$-//O)/ ( (%~SbS!^CRC%88888"="#56
J''''"L #L
 	 	 	& )+BCCCy{{H 
"#344%%%_x*7L___	
 	
 	
 #$BBBr&   r   list[pir.Value]r   c           	        t          j        |          }t          j        |          }t          | ||||||          }t          d           t          d |D                        t          d           t          d |D                        d}|D ]}	|t	          |	          z  }t          d|dz  dz  dz  d           t          | ||||          \  } }| |fS )	a  
    Partition the joint graph, recompute the intermediate values
    by saved values to save memory.
    Args:
        program(Program): The program to be recomputed.
        saved_values(list[valueiable]): The saved values
            of forward graph which used by backward graph.
        inputs:(list[Value]|tuple(Value)): The input Values
            of the forward graph.
        outputs(list[valueiable]): The out values
            of the forward graph.
        forward_op_end_idx(int): The index of the last forward op.
        backward_op_start_idx(int): The index of the start backward op.
    Returns:
        recomputed_program(Program): The recomputed program.
        fwd_op_end_idx(int): The index of the last forward op in
            recomputed program.
    zsaved values: c                h    g | ]/}d | d|                                                                  d0S (z, )rQ   r   r/   r   s     r$   
<listcomp>z)partition_joint_graph.<locals>.<listcomp>  sD    KKK4A44**,,//11444KKKr&   zmid values: c                h    g | ]/}d | d|                                                                  d0S r   r   r   s     r$   r   z)partition_joint_graph.<locals>.<listcomp>  sD    NNN4A44**,,//11444NNNr&   r   zSaved Memory is: i   GB)r   r   analyze_mid_hold_valuesr%   r   (replace_mid_values_with_forward_subgraph)
r:   r   r{   r}   r   r   r   mid_hold_valuesmemmids
             r$   r   r     s    6 "*<88L%g..G .  O    KKlKKKLLL~NNoNNNOOO
C ( ("3'''"C$J$5$<dCCC G G^ N""r&   c                `   d }t          |           }t          |                                 j        d |dz                      }t          |                                 j        |d                    }|                                 j        |         }	 |||          }
|
d         }|
d         }|
d         }t	          | |||	||          \  }}}|D ]}|                    dd           |D ]}|                    dd           t          j                    }|D ]B}|                    |          }|	                    ||           |
                    |           Ct          |          D ]}|                    |d           }|                                D ]k}|                                D ]T}|                    |d	          r<|2|                    ||                   |                    |          k     r||         }Ul|J |||<   |D ]}|                    ||                    | |fS )
Nc                    fdt                      }t          j                    }|}|D ]} || ||g            t          dt	          |                     t          d|           |||d}|S )Nc                F   t          |          }|                    |            |                                 }||v s|d S |                                dv r| |vr|                    |            d S |                                }t          |          dk    r@|                                dvr*t          d|  d|                                 d|           |D ]/}||v r||vr|                    |             	|||||           0|                    |           d S )N)zbuiltin.parameterz
pd_op.datar   )r   r   zEvery path to recompute value zr must have saved value or starting point of the path is one of op in [pd_op.full, pd_op.full_int_array], but find z op, op ir is )listrE   rQ   r]   rF   rD   rg   	Exception)
recompute_valuer   marked_recompute_opsneeded_saved_valueschain	new_chain	define_op	op_inputsop_input_find_recompute_opss
            r$   r  zreplace_mid_values_with_forward_subgraph.<locals>._extract_forward_recompute_subgraph_for_backward.<locals>._find_recompute_ops  s    UI_---'7799I000I4E~~ $   #*==='++O<<<!1133I9~~""y~~'7'7 @ ( (   d_  d  d  yB  yG  yG  yI  yI  d  d  Yb  d  d   &  |++':::+//999## ('    !$$Y///Fr&   zRecompute Ops: )r{   recompute_opsr}   )r-   r   r   r%   rg   )r   
mid_valuesrecompute_subgraph_opsrecompute_subgraph_inputs*recompute_subgraph_outputs_backward_neededr  recompute_subgraphr  s          @r$   0_extract_forward_recompute_subgraph_for_backwardzbreplace_mid_values_with_forward_subgraph.<locals>._extract_forward_recompute_subgraph_for_backward  s    )	 )	 )	 )	 )	V "%$2$;$=$=!5?2) 	 	O&)    	$c*@&A&ABBB$&<===/3A
 

 "!r&   r
   r  r{   r}   is_recompute_opTis_recompute_bw_opr   )rp   r-   r3   r4   clone_graphset_bool_attrr   r   look_upreplace_grad_users_withrF   re   rw   rI   rW   ry   move_before)r:   r   r  r   r   r  op_2_id_mapr   backward_opsfirst_backward_oprecompute_forward_subgraph
origin_opsorigin_subgraph_inputsorigin_subgraph_outputs
cloned_ops	value_mapcloned_op_first_grad_user_map	origin_op	cloned_opcloned_subgraph_outputsorigin_valuecloned_valuer0   first_subgraph_grad_user
op_outputschilds                             r$   r  r    s   B" B" B"H G$$Kg**,,01E>A3E1EFGGKw++--12G2H2HIJJL,,..23HI 	98*	
 	
  ,O<J7A8C;F< <8J	8   9 9	 148888 < <	 4d;;;; -577/ 2 2 ((66,,\<HHH##L1111 z"" E E#@#D#DR#N#N **,, 	 	J#>>@@  044UA>> /7;;N;N5e<< <#++,DEE<F <F :%@ 1 (333,D%b)) H H	;IFGGGGN""r&   c                   |                                  j        }t          |d |dz                      }t          t	          d|dz                       }t          j        |                                                      |                    }t          t	          |dz   t          |                              }t          j        |                                                      |                    }||t          j                    fS )Nr
   r   )	r3   r4   r-   r
  ranger   r   get_values_by_op_idxrg   )	r:   r~   r   r   required_fw_opsrequired_fw_op_idxsr   required_bw_op_idxsr   s	            r$   r   r   q  s    ""$$(G'"6NQ$6"6788OuQ(:;;<<,5334GHH  u^a%7WFFGG,5334GHH ( 	 !! r&   c                6   t          j                    }|                                 }|r| |v rfd|D             }|D ]}|                                dk    r|                                d         }|                                D ]}|                                }	|	D ]}
t          |
                                          dk    rr|
                                d                                         dv rD|
                                d                                         }|t          j        |          z  }|                    |
           Ȑ|                                }	|	D ]}
t          |
                                          dk    rr|
                                d                                         dv rD|
                                d                                         }|t          j        |          z  }|                    |
           |S )zP
    Find all the value nodes which use the same value node to be computed.
    c                    g | ]}|v |	S r+   r+   )r/   r0   r   s     r$   r   z)find_value_node_users.<locals>.<listcomp>  s#    999"r['8'82'8'8'8r&   r   r   r
   r   )r   r   rW   r]   rI   rg   rF   )r   r   without_no_need_bufferr   r   r4   r0   combine_resultcombine_res_used_oprI   rY   split_resultss      `        r$   r   r     sR    #%%E

/
/
1
1C :1119999999C $& $&7799)))ZZ\\!_N  ::<<* *#-5577% * *F99;;  %@@BB dff !    )/(I(I(K(K)!')) & !8!G!GG		&))))*	*( jjllG! & &5577  !<<>>qAFFHH M   %+$E$E$G$G%gii " ^4]CCCEEIIf%%%%& Lr&   c                V   t          j                    }|                                 }|                                dv rC|                                d         }|                                }|                                }n|                                }|D ]}|                                rf|                                                                dk    r<|t          j        |                                                                          z  }||                    |           |S )Nr   r   r   )r   r   rQ   r]   rD   rF   )output_value_nodereal_input_nodesr  r  real_define_opinput_value_nodesinput_value_nodes          r$   r   r     s&   %.00!1133I~~===,,..q1!1133*::<<%5577- 	3 	3,,..	3 00227799=NNN 7 0022BBDD! !    !12222r&   c                    |                                  }|                                dv r@|                                d         }|                                                                 S |                                S )Nr   r   )rQ   r]   rD   )r   r  r  s      r$   r   r     sm    **,,I~~===,,..q1''))..000~~r&   c                F    	 d| j         v S #  t          d|  d          xY w)Nz!value node not found in program:  )shape
ValueErrorr   s    r$   is_dynamic_value_noderN    s:    LZ%%%LJZJJJKKKs     c                    	 |                                                                  d uS #  t          d|  d          xY w)Nzvalue node illegal: rJ  )typeas_vec_typerL  rM  s    r$   is_vector_value_noderR    sK    ?  ,,..d::?=
===>>>s   '* ?c                    t          |           rd | j        D             }n| j        }t          j        d |d          t          | j                 z  S )Nc                    g | ]
}|d k    |S )rI  r+   r   s     r$   r   z,cal_value_node_size_impl.<locals>.<listcomp>  s    CCC!177A777r&   c                    | |z  S r>   r+   )xys     r$   <lambda>z*cal_value_node_size_impl.<locals>.<lambda>  s
    a!e r&   r
   )rN  rK  	functoolsreduce_PADDLE_DTYPE_2_NBYTESdtype)r   value_node_shapes     r$   cal_value_node_size_implr^    sa    Z(( ,CCz'7CCC%+++-=qAA
 !1
2	3r&   c                    t          |           rS|                                                                                                 }d}|D ]}|t	          |          z  }|S t	          |           S Nr   )rR  rP  rQ  as_listr^  )r   	value_vecsum_res
child_nodes       r$   r   r     su    J'' OO%%1133;;==	# 	< 	<J/
;;;GG#J///r&   c                   t          j                    }t          |           D ]}|                                dk    r|                                }|D ]}|                                }t          |          dk    r|d                                         dv rFt          |          }||vrd||<   _t          d          ||<   |D ]$}t          ||         ||         dz             ||<   %|S )Nr   r
   r   r   g    eA)
r   	ValueDictre   r]   rI   rW   rg   r   r   r   )	r   r   r   r0   
op_results	op_resultused_ops
real_usersr   s	            r$   r   r     s$   !+--Lw  7799)))ZZ\\
# 	 	I ;;==H8}}!!hqk&6&6&8&8 = ' ' .y99J 777*+Y''*-c((Y'&  D.1$Y/d1Ca1G/ /L++	  r&   c                f    d }t          t          ||                                                    S )Nc                    |                                  dk    r:t          |                     d                                                    dk    rdS dS )Nr   r   FT)r]   rg   rY   rW   )r0   s    r$   filter_unused_combinez;all_used_op_consider_combine.<locals>.filter_unused_combine)  sI    GGII***BIIaLL;;==>>!CC5tr&   )r
  filterrW   )r:   rL   rm  s      r$   all_used_op_consider_combinero  (  s?       $e&F&F&H&HII  r&   c                   t          |                                 j        d |dz                      }t          |                                 j        |d                    t          j                    }|D ]}	|	                                D ]m}
t          | |
          }t          fd|D                       r@|
|vr<|
|vr8|
|vr4|
|vr0|	                                t          vr|
                    |
           n|S )Nr
   c              3      K   | ]}|v V  	d S r>   r+   )r/   used_opr$  s     r$   r   z*analyze_mid_hold_values.<locals>.<genexpr>F  s(      HHG|+HHHHHHr&   )r-   r3   r4   r   r   rI   ro  anyr]   r   rF   )r:   r   r{   r}   no_need_buffer_valuesr   r   r   r  r0   rY   all_used_opsr$  s               @r$   r  r  6  s#    g**,,01E>A3E1EFGGKw++--12G2H2HIJJL$-//O , ,jjll 
	, 
	,F7HHLHHHH<HHHHH,,..'))&(("777GGIIZ//##F+++
	, r&   c                    d }|                                  d                                         D ]6}||v r0|,|                    |          |                    |          k     r|}7|S r`  )rI   rW   ry   )fwd_opr$  r#  first_backward_use_opuser_ops        r$   get_first_backward_use_oprz  Q  s     >>##A&AACC , ,l""!)""7++!!"7889 9 %,!  r&   c           	        t          j        |           |                                 j        }t          j                                         }t          |          }g }i }	|D ]}
|                    |
|
           |D ]}||v r|                    |t          j         	                    ddd                    }t          |||          }|`|                    d          rK|                    d          r6|                    d|j                   |                    d|j                   |                    |           |||	|<   t          j        |                                            |||	fS )NFTop_rolechunk_id)r   set_insertion_pointr3   r4   paddle	IrMappingr-   rF   cloneCloneOptionsrz  has_attrset_int_attrr|  r}  rE    set_insertion_point_to_block_end)r:   r'  graph_inputsclone_insertion_opr$  r#  r   r+  r*  r,  input_valuer0   r`   rx  s                 r$   r  r  ]  s    .///""$$(G
$$&&IZJJ$&!# 0 0k;//// N NXX6:225$EE F %>L+% %! &1)229== 2)22:>> 2 ##I/D/LMMM##J0E0NOOOf%%%$08M-f5()=)=)?)?@@@y"???r&   r>   )r:   rz   r{   r|   r}   r|   r~   r|   r   r   r   r   r   r   r   r   )r:   rz   r   r   r{   r   r}   r   r   r   r   r   r   r   r   r   )@
__future__r   rY  r   r   r   r   typingr   r  r   paddle.autogradr   paddle.baser   paddle.base.frameworkr   collections.abcr	   DataTypeBOOLFLOAT16BFLOAT16FLOAT32FLOAT64FLOAT8_E4M3FNFLOAT8_E5M2INT8INT16INT32INT64UINT8	COMPLEX64
COMPLEX128r[  r   __annotations__r   r   r   r   r   r   r   r   r%   r(   rp   r   r   r  r   r   r   r   rN  rR  r^  r   r   ro  r  rz  r  r+   r&   r$   <module>r     s   # " " " " " "       				                     * * * * * *       4 4 4 4 4 4 )(((((( 	MM1MAM1M1MMqMMMMMMQMb $T' T' T'  T T T Tp '     
     KKK
 K K K K	$ 	$ 	$  	 	 	 	 
     !  ! ! !a
 a
 a
 a
 a
 a
 a
 a
H	6 	6 	6 	6 	6 	6 	6 	6& .2QC QC QC QC QCh;# ;# ;# ;#|~# ~# ~#B! ! !N   	3 3 3 3l  ,     L L L? ? ?  0 0 0  4    6	! 	! 	!#@ #@ #@ #@ #@r&   