
    jp                    J   U d dl mZ d dlZd dlZd dlmZ d dlmZmZmZm	Z	 d dl
mZmZmZ d dlZd dlmZ erd dlmZ dd	lmZ g d
Z ed          Z ed          Z eej        d          sH ed          ej        j        d<    ed          ej        j        d<    ed          ej        j        d<   d dlmZmZmZ d0dZd1dZ G d de          Z  G d d          Z!e	dede"f         f         Z#de$d<   e	 	 	 d2d3d)            Z%e	 	 	 d2d4d,            Z%	 	 	 d2d5d/Z%dS )6    )annotationsN)Callable)overloadTYPE_CHECKING	TypeAliasUnion)	ParamSpecSelfTypeVar)Tensor)_POOL_HANDLE   )_dummy_type)is_current_stream_capturinggraph_pool_handle	CUDAGraphgraphmake_graphed_callables_R_P_CudaStreamBase
_CUDAGraph_graph_pool_handle_cuda_isCurrentStreamCapturing)r   r   r   returnboolc                     t                      S )zReturn True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r        [/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/torch/cuda/graphs.pyr   r   -   s    
 *+++r   r   c                 X    t           j                            t                                S )zReturn an opaque token representing the id of a graph memory pool.

    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )torchcudar   r   r   r   r    r   r   6   s!     :""#5#7#7888r   c                       e Zd ZdZdd fdZ	 dd fdZd  fdZd  fdZd  fdZd  fdZ	d! fdZ
d  fdZd" fdZd# fdZd# fdZ xZS )$r   a-  Wrapper around a CUDA graph.

    Arguments:
        keep_graph (bool, optional): If ``keep_graph=False``, the
            cudaGraphExec_t will be instantiated on GPU at the end of
            ``capture_end`` and the underlying cudaGraph_t will be
            destroyed. Users who want to query or otherwise modify the
            underlying cudaGraph_t before instantiation can set
            ``keep_graph=True`` and access it via ``raw_cuda_graph`` after
            ``capture_end``. Note that the cudaGraphExec_t will not be
            instantiated at the end of ``capture_end`` in this
            case. Instead, it will be instantiated via an explicit called
            to ``instantiate`` or automatically on the first call to
            ``replay`` if ``instantiate`` was not already called. Calling
            ``instantiate`` manually before ``replay`` is recommended to
            prevent increased latency on the first call to ``replay``. It
            is allowed to modify the raw cudaGraph_t after first calling
            ``instantiate``, but the user must call ``instantiate`` again
            manually to make sure the instantiated graph has these
            changes. Pytorch has no means of tracking these changes.

    .. warning::
        This API is in beta and may change in future releases.

    F
keep_graphr   r   r
   c                H    t                                          | |          S N)super__new__)clsr%   	__class__s     r    r)   zCUDAGraph.__new__]   s    wwsJ///r   Nglobalpool_POOL_HANDLE | Nonecapture_error_modestrNonec                N    t                                          ||           dS )a  Begin capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )r-   r/   N)r(   capture_begin)selfr-   r/   r+   s      r    r3   zCUDAGraph.capture_begin`   s)    & 	4<NOOOOOr   c                H    t                                                       dS )aG  End CUDA graph capture on the current stream.

        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r(   capture_endr4   r+   s    r    r6   zCUDAGraph.capture_endu   s!     	r   c                H    t                                                       dS )a$  Instantiate the CUDA graph. Will be called by
        ``capture_end`` if ``keep_graph=False``, or by ``replay`` if
        ``keep_graph=True`` and ``instantiate`` has not already been
        explicitly called. Does not destroy the cudaGraph_t returned
        by ``raw_cuda_graph``.
        N)r(   instantiater7   s    r    r9   zCUDAGraph.instantiate   s!     	r   c                H    t                                                       dS )z,Replay the CUDA work captured by this graph.N)r(   replayr7   s    r    r;   zCUDAGraph.replay   s    r   c                H    t                                                       dS )z1Delete the graph currently held by this instance.N)r(   resetr7   s    r    r=   zCUDAGraph.reset   s    r   r   c                D    t                                                      S )zReturn an opaque token representing the id of this graph's memory pool.

        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r(   r-   r7   s    r    r-   zCUDAGraph.pool   s     ww||~~r   c                D    t                                                      S )z/Enable debugging mode for CUDAGraph.debug_dump.)r(   enable_debug_moder7   s    r    r@   zCUDAGraph.enable_debug_mode   s    ww((***r   
debug_pathc                F    t                                          |          S )z
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r(   
debug_dump)r4   rA   r+   s     r    rC   zCUDAGraph.debug_dump   s     ww!!*---r   intc                D    t                                                      S )a}  Returns the underlying cudaGraph_t. ``keep_graph`` must be True.

        See the following for APIs for how to manipulate this object: `Graph Managmement <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html>`_ and `cuda-python Graph Management bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-management>`_
        )r(   raw_cuda_graphr7   s    r    rF   zCUDAGraph.raw_cuda_graph   s    
 ww%%'''r   c                D    t                                                      S )a  Returns the underlying cudaGraphExec_t. ``instantiate`` must have been called if ``keep_graph`` is True, or ``capture_end`` must have been called if ``keep_graph`` is False. If you call ``instantiate()`` after ``raw_cuda_graph_exec()``, the previously returned cudaGraphExec_t will be destroyed. It is your responsibility not to use this object after destruction.

        See the following for APIs for how to manipulate this object: `Graph Execution <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH__EXEC.html>`_ and `cuda-python Graph Execution bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-execution>`_
        )r(   raw_cuda_graph_execr7   s    r    rH   zCUDAGraph.raw_cuda_graph_exec   s    
 ww**,,,r   )F)r%   r   r   r
   )Nr,   )r-   r.   r/   r0   r   r1   r   r1   r   r   )rA   r0   r   r1   )r   rD   )__name__
__module____qualname____doc__r)   r3   r6   r9   r;   r=   r-   r@   rC   rF   rH   __classcell__)r+   s   @r    r   r   B   s        40 0 0 0 0 0 0 KSP P P P P P P*	 	 	 	 	 	                    + + + + + +. . . . . .( ( ( ( ( (- - - - - - - - - -r   r   c                  B    e Zd ZU dZdZded<   	 	 	 dddZddZddZdS )r   a  Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    Ntorch.cuda.Stream | Nonedefault_capture_streamr,   
cuda_graphr   r-   r.   streamr/   r0   c                N   |4| j         j        (t          j                                        | j         _        |dn|f| _        ||n| j         j        | _        | j        t          d          t          j                            | j                  | _	        || _
        || _        d S )Nr   zcapture_stream must not be None)r+   rR   r"   r#   Streamr-   capture_streamAssertionErrorrT   
stream_ctxrS   r/   )r4   rS   r-   rT   r/   s        r    __init__zgraph.__init__   s     >dnCK49J4E4E4G4GDN1;?<RRdW	(FFdn.S 	 & !BCCC*++D,?@@$"4r   r   r1   c                v   t           j                                         t           j        j        j        rt          j                     t           j                                         t           j	        
                                 | j                                          | j        j        | j        d| j        i d S )Nr/   )r"   r#   synchronizecompilerconfigforce_cudagraph_gcgccollectempty_cache_C_host_emptyCacherY   	__enter__rS   r3   r-   r/   )r4   s    r    re   zgraph.__enter__   s    
   > 3 	 JLLL
   !!### 	!!###%%Y	
  $6		
 	
 	
 	
 	
r   argsobjectc                V    | j                                           | j        j        |  d S r'   )rS   r6   rY   __exit__)r4   rf   s     r    ri   zgraph.__exit__  s.    ##%%%  $''''r   )NNr,   )rS   r   r-   r.   rT   rQ   r/   r0   rI   )rf   rg   r   r1   )	rK   rL   rM   rN   rR   __annotations__rZ   re   ri   r   r   r    r   r      s          : 8<;;;;
 %)+/"*5 5 5 5 5.
 
 
 
6( ( ( ( ( (r   r   torch.nn.Module.r   _ModuleOrCallable   F	callablessample_argstuple[Tensor, ...]num_warmup_itersrD   allow_unused_inputr-   r.   c                    d S r'   r   rn   ro   rq   rr   r-   s        r    r   r     s	     r   tuple[_ModuleOrCallable, ...]tuple[tuple[Tensor, ...], ...]c                    d S r'   r   rt   s        r    r   r     s	     %(Cr   1_ModuleOrCallable | tuple[_ModuleOrCallable, ...]3tuple[Tensor, ...] | tuple[tuple[Tensor, ...], ...]c                  )* t          j                    r"t          j                    rt          d          d}t	          | t
                    s.d}| f} t          j        t
          t          df         |          f}n4t          j        t
          t
          t          df         df         |          }g )t          | |          D ]\  }}t	          |t           j
        j                  rt          |j                  dk    r0t          |j                  dk    rt          |j                  dk    st!          d          t#          d |                                D                       st!          d          t          j        j        j        | }	)                    t          |	                     t#          d	 |	D                       st!          d
          d )D             }
d | D             *)*fdt/          t          |                     D             }d t/          t          |                     D             }d t/          t          |                     D             }|t1                      n|}t           j                                         t           j                            t           j                                                  5  t          | ||          D ]\  }}}d\  }}}t/          |          D ]}t           j        j                             ||           }t          d |D                       }t          |          dk    rRt           j                            |t          d |D                       t          d |D                       d|          }|||fD ]}~	 ddd           n# 1 swxY w Y   t           j                                         g }g }t          | ||          D ]\  }}}t           j                             ||          5   || }ddd           n# 1 swxY w Y   t           j        j        !                    |          \  }}|                    t          |                     |                    |           g }g }t          tE          |          tE          |          tE          |                    D ]Z\  }}}t          d |D                       } t          d |D                       }d}t          |          dk    rt           j                             ||          5  t           j                            |t          d |D                       t          d | D                       d|          }ddd           n# 1 swxY w Y   g }!d}"|D ]A}#|#j#        r#|!|!                    ||"                    |"dz  }",|!                    d           Bt          |!          }!|                    |            |                    |!           \|$                                 |$                                 d6d-}$g }%tK          |           D ]\  }&} |$||&         ||&         *|&         |
|&         ||&         ||&         ||&         ||&         ||&         	  	        }'t	          |t           j
        j                  r7d7d5}( |(||j&        |'|j'                  |_'        |%                    |           |%                    |'           |r|%d         S t          |%          S )8a  Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.
        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FT.r   zModules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c              3  (   K   | ]}|j         d u V  dS )FNrequires_grad.0bs     r    	<genexpr>z)make_graphed_callables.<locals>.<genexpr>  s)      EEAq%/EEEEEEr   zIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c              3  J   K   | ]}t          |t          j                  V  d S r'   )
isinstancer"   r   )r   args     r    r   z)make_graphed_callables.<locals>.<genexpr>  s.      HHS:c5<00HHHHHHr   zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.c                ,    g | ]}t          |          S r   )len)r   rf   s     r    
<listcomp>z*make_graphed_callables.<locals>.<listcomp>  s    !L!L!L#d))!L!L!Lr   c                    g | ]D}t          |t          j        j                  r!t	          |                                          nd ES )r   )r   r"   nnModuletuple
parameters)r   cs     r    r   z*make_graphed_callables.<locals>.<listcomp>  sP     " " " ",Aux!?!?GallnnR" " "r   c                2    g | ]}|         |         z   S r   r   )r   iflatten_sample_argsper_callable_module_paramss     r    r   z*make_graphed_callables.<locals>.<listcomp>  s9     * * * 	A!;A!>>* * *r   c                J    g | ] }t           j                                        !S r   r"   r#   r   r   _s     r    r   z*make_graphed_callables.<locals>.<listcomp>  &    HHHQ%*&&((HHHr   c                J    g | ] }t           j                                        !S r   r   r   s     r    r   z*make_graphed_callables.<locals>.<listcomp>  r   r   N)NNNc              3  (   K   | ]}|j         	|V  d S r'   r|   r   os     r    r   z)make_graphed_callables.<locals>.<genexpr>  s)      $K$K11?$KQ$K$K$K$K$K$Kr   c              3  (   K   | ]}|j         	|V  d S r'   r|   r   r   s     r    r   z)make_graphed_callables.<locals>.<genexpr>  s=       % %"#q%% % % % % %r   c              3  L   K   | ]}|j         	t          j        |          V   d S r'   r}   r"   
empty_liker   s     r    r   z)make_graphed_callables.<locals>.<genexpr>  sH       + +45AO+!,Q//+ + + + + +r   )outputsinputsgrad_outputsonly_inputsallow_unused)r-   c              3  P   K   | ]!}|j         rt          j        |          nd V  "d S r'   r   r   s     r    r   z)make_graphed_callables.<locals>.<genexpr>  sJ       $
 $
AB1?<EQ$
 $
 $
 $
 $
 $
r   c              3  (   K   | ]}|j         	|V  d S r'   r|   r   s     r    r   z)make_graphed_callables.<locals>.<genexpr>  s)      JJ1!/JQJJJJJJr   c              3  (   K   | ]}|j         	|V  d S r'   r|   r   s     r    r   z)make_graphed_callables.<locals>.<genexpr>  s)       T TqAO T T T T T T Tr   c              3     K   | ]}||V  	d S r'   r   r   s     r    r   z)make_graphed_callables.<locals>.<genexpr>  s"      &W&WQq&W&Wr      	fwd_graphr   	bwd_graphmodule_paramstuple[torch.nn.Parameter, ...]len_user_argsrD   output_unflatten_spectorch.utils._pytree.TreeSpecstatic_input_surfacerp   static_outputsstatic_grad_outputstuple[Tensor | None, ...]static_grad_inputsr   Callable[..., object]c	           	     t    
  G  fddt           j        j                  
d
fd}	|	S )Nc                      e Zd Zedfd            Zeej        j        j        d fd	                        Z	d
S )Omake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphedctxrg   r   r   r   rp   c                   t                    D ]Y}|                                         ||                                         k    r!|                             ||                    Z                                 t	          t
                    st          dt                               t          d D                       S )Nz"static_outputs must be tuple, got c              3  >   K   | ]}|                                 V  d S r'   detachr   s     r    r   zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>  s*      @@AQXXZZ@@@@@@r   )rangedata_ptrcopy_r;   r   r   rX   type)r   r   r   r   r   r   r   s      r    forwardzWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward  s     }-- A AA+A.7799VAY=O=O=Q=QQQ,Q/55fQi@@@  """!.%88 (ST.=Q=QSS   @@@@@@@@r   gradsc                   t          |          t                    k    r/t          dt          |           dt                               t          |          D ]F\  }}|?|                                |                                k    r|                    |           G                                 t          t                    st          dt                               t          d D                       S )Nzlen(grads)=z != len(static_grad_outputs)=z&static_grad_inputs must be tuple, got c              3  F   K   | ]}||                                 n|V  d S r'   r   r~   s     r    r   zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>3  sH          #$-AHHJJJQ     r   )	r   rX   zipr   r   r;   r   r   r   )r   r   ggradr   r   r   s       r    backwardzXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward  s$    u::%8!9!999(ic%jjiisSfOgOgii    ##6>> * *GAt} ::<<4==??::GGDMMM  """ ""4e<< ([FXAYAY[[      0     r   N)r   rg   r   r   r   rp   )r   rg   r   r   r   rp   )
rK   rL   rM   staticmethodr   r"   autogradfunctiononce_differentiabler   )r   r   r   r   r   r   r   s   r    Graphedr     s        
A 
A 
A 
A 
A 
A 
A 
A \
A ^$8       98 \  r   r   	user_argsrg   r   c                     t          j        j        j        |  } j        t          |          z    }t           j        j                            |          S r'   )r"   utils_pytreearg_tree_leavesapplyr   tree_unflatten)r   flatten_user_argsoutr   r   r   s      r    functionalizedzVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalized9  sP     !& 3 CY O'-%(9":":]"JLC;&55c;PQQQr   )r   rg   r   rg   )r"   r   Function)r   r   r   r   r   r   r   r   r   r   r   s   ````````` @r    make_graphed_autograd_functionz>make_graphed_callables.<locals>.make_graphed_autograd_function  s    (	 (	 (	 (	 (	 (	 (	 (	 (	 (	 (	 (	 (	en- (	 (	 (	T	R 	R 	R 	R 	R 	R 	R 	R r   funcrk   graph_training_stater   graphedCallable[_P, _R]orig_fwdc                      d fd}|S )	Nr   _P.argsuser_kwargs	_P.kwargsr   r   c                 :    j         k    r | i |S  | i |S r'   )training)r   r   r   r   r   r   s     r    new_fwdzEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwdZ  s=     }(<<<&w	A[AAA'xBkBBBr   )r   r   r   r   r   r   r   )r   r   r   r   r   s   ```` r    make_graphed_forwardz4make_graphed_callables.<locals>.make_graphed_forwardT  sC    C C C C C C C C C r   )r   r   r   r   r   r   r   rD   r   r   r   rp   r   rp   r   r   r   rp   r   r   )
r   rk   r   r   r   r   r   r   r   r   )(r"   is_autocast_enabledis_autocast_cache_enabledRuntimeErrorr   r   typingcastr   r   r   r   r   _backward_hooks_forward_hooks_forward_pre_hooksrX   allbuffersr   r   r   appendr   r   r#   r\   rT   rV   tree_leavesr   r   r   tree_flattenreversedr}   reverse	enumerater   r   )+rn   ro   rq   rr   r-   just_one_callable_sample_argsr   rf   flatten_argper_callable_len_user_args"per_callable_static_input_surfaces
fwd_graphs
bwd_graphsmempoolr   r   grad_inputsr   outputs_gradr   vper_callable_static_outputs"per_callable_output_unflatten_specr   func_outputsflatten_outputsspec per_callable_static_grad_outputsper_callable_static_grad_inputsr   r   r   r   grad_idxr   r   retr   r   r   r   r   s+                                            @@r    r   r   %  s	   R  "" 
u'F'H'H 
m
 
 	
  i'' P L	E&#+$6DDF{5vs{);S)@#A;OOy,//  4a)) 	A%&&!++())Q..,--22$a   EEEEEEE $1  
 k)94@""5#5#5666HHKHHHHH 	 ^  	 "M!L8K!L!L!L" "" " "* * * * *s9~~&&* * *&
 IH%I2G2GHHHJHH%I2G2GHHHJ%)\!!!tG
 
J			5:,,..	/	/  03|%G1
 1
 	 	,D$, 2B.K,+,,  +-99$$+FF$$K$K$K$K$KKK|$$q(("'."5"5 ,$ % %';% % %     &+ + +9@+ + + & & %)%7 #6 
# 
#K |[9  A'	              . 
J #%)+&!$Yj!I!I 8 8dIZig66 	' 	'4;L	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' !& 3 @ @ N N#**5+A+ABBB*11$7777 (*$&(#;>344,--< < %C %C7ni $ $
 $
FT$
 $
 $
 
 
 JJJJJJJ|q  !!)'!::  #n11(  T T,@ T T TTT!&&W&W2E&W&W&W!W!W $!3 2                  ' 	0 	0C  0[%<"))+h*?@@@A"))$////"#566(//0CDDD'../ABBBB %,,...#++---= = = =@ $&CY'' $  $ 400qMqM&q)&q).q1.q1'*,Q/+A.

 

 dEHO,, 	      0/dmWdl DL JJtJJw 1v::s8   COOO&P88P<	?P<	AV,,V0	3V0	)r   r   rJ   )rm   FN)rn   rl   ro   rp   rq   rD   rr   r   r-   r.   r   rl   )rn   ru   ro   rv   rq   rD   rr   r   r-   r.   r   ru   )rn   rx   ro   ry   rq   rD   rr   r   r-   r.   r   rx   )&
__future__r   r`   r   collections.abcr   r   r   r   r   typing_extensionsr	   r
   r   r"   r   
torch.cudar   _utilsr   __all__r   r   hasattrrc   __dict__torch._Cr   r   r   r   r   r   r   rg   rl   rj   r   r   r   r    <module>r     s   " " " " " " " 				  $ $ $ $ $ $ < < < < < < < < < < < < 6 6 6 6 6 6 6 6 6 6         (''''''               WT]]Yt__ wux*++ &1k,&?&?EHl#.9k:N.O.OEH*+:E+(; ;EH67 T S S S S S S S S S, , , ,9 9 9 9q- q- q- q- q-
 q- q- q-hT( T( T( T( T( T( T( T(p  %%6f8M%MN  N N N N 
 $ $    
 
 $ $( ( ( ( 
( $ $I I I I I I Ir   