o
    :/iz                     @   s  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlZd dlmZ d dlmZ d d	lmZ d dlmZ d d
lm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z= ddl>m?Z?m@Z@ ddlAmBZB e)eCZDdeEeF deEejGdB  dedef dedef fddZHde de4fd d!ZIG d"d# d#ZJG d$d% d%eKZLejMG d&d' d'ZNd(ejOdePfd)d*ZQd+eRejOeFf d,eEeF ddfd-d.ZSd/ejTd0eEeU deVejTeEeN f fd1d2ZWd3aXd4ed5e"de d6ePd7ePdefd8d9ZYG d:d; d;ejjZZ[d<a\eUe]d=< d>a^ePe]d?< edGd@eUdAePdedB fdCdDZ_G dEdF dFZ`dS )H    N)defaultdict)Callable	GeneratorSequence)contextmanager)deepcopy)partialAny)dynamo_timed)trace_structured)CompilationConfigCUDAGraphMode
VllmConfig)DynamicShapesType)Rangehash_factors)init_logger)lazy)current_platform)
instrumentinstrument_manual)resolve_obj_by_qualname   )CompilerInterfaceEagerAdaptorInductorAdaptorInductorStandaloneAdaptoris_compile_cache_enabled)compilation_counter)inductor_partition_rule_contextshould_split)InductorPasspass_context)PostGradPassManagersym_tensor_indicesinput_bufferscallable_fn.returnc                    s   dt dt f fdd}|S )a  Create a wrapper that copies inputs to static buffers before calling.

    This is used for cudagraph input copying where we need to copy dynamic
    tensors to static buffers before invoking the compiled graph.

    Args:
        sym_tensor_indices: Indices of tensors with symbolic shapes
        input_buffers: List of static buffers (can contain None for lazy init)
        callable_fn: The compiled function to call

    Returns:
        A wrapper function that copies inputs and calls the compiled function
    argsr(   c                     sn   t | }tD ]*\}}|| }|jd }| d u r!| |< | d | }|| |||< q | S Nr   )list	enumerateshapeclonecopy_)r)   	list_argsiindexruntime_tensorruntime_shapestatic_tensorr'   r&   r%    f/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/compilation/backends.pycopy_and_callG   s   


z)make_copy_and_call.<locals>.copy_and_callr	   )r%   r&   r'   r9   r7   r6   r8   make_copy_and_call4   s   r:   compilation_configc                 C   s   t jr
t js
J d| jdkr*t jr"ttjdr"td t	| j
S td t S | jdkr7td t S td| j tt  }t|tsLJ |S )	NzCVLLM_USE_MEGA_AOT_ARTIFACT=1 requires VLLM_USE_STANDALONE_COMPILE=1inductorstandalone_compilezUsing InductorStandaloneAdaptorzUsing InductorAdaptoreagerzUsing EagerAdaptorzUsing custom backend: %s)envsVLLM_USE_MEGA_AOT_ARTIFACTVLLM_USE_STANDALONE_COMPILEbackendhasattrtorch	_inductorloggerdebugr   compile_cache_save_formatr   r   r   r   get_compile_backend
isinstancer   )r;   compilerr7   r7   r8   make_compilerY   s(   




rL   c                   @   s   e Zd ZdZdeddfddZdedefdd	Ze	d
e
ded fddZ	d%dedededdfddZd&ddZdejdee ded
e
dedef dB f
ddZedd		 d'dejdee d!eeef ded
e
ded"edefd#d$ZdS )(CompilerManagera  
    A manager to manage the compilation process, including
    caching the compiled graph, loading the compiled graph,
    and compiling the graph.

    The cache is a dict mapping
    `(runtime_shape, graph_index, backend_name)`
    to `any_data` returned from the compiler.

    When serializing the cache, we save it to a Python file
    for readability. We don't use json here because json doesn't
    support int as key.
    r;   r(   Nc                 C   s(   t  | _d| _|| _t|| _i | _d S NF)dictcacheis_cache_updatedr;   rL   rK   loaded_artifacts)selfr;   r7   r7   r8   __init__   s
   

zCompilerManager.__init__vllm_configc                 C   s   | j |S N)rK   compute_hash)rS   rU   r7   r7   r8   rW      s   zCompilerManager.compute_hashcompile_rangeNNNc              	   c   s    t |2 | jjr$t| jj dV  W d   n1 sw   Y  ndV  W d   dS W d   dS 1 s:w   Y  dS )zProvide compilation context for the duration of compilation to set
        any torch global properties we want to scope to a single Inductor
        compilation (e.g. partition rules, pass context).N)r#   r;   use_inductor_graph_partitionr    splitting_ops)rS   rX   r7   r7   r8   compile_context   s   
" zCompilerManager.compile_contextF 	cache_dirdisable_cacheprefixc                    s   || _ || _tj|d| _|sZtj| jrZt| j}t	|
 }W d   n1 s.w   Y  dtdtddfdd dtdttttf f fd	d
fdd| D | _| jj|||d dS )a  
        Initialize the cache directory for the compiler.

        The organization of the cache directory is as follows:
        cache_dir=/path/to/hash_str/rank_i_j/prefix/
        inside cache_dir, there will be:
        - vllm_compile_cache.py
        - computation_graph.py
        - transformed_code.py

        for multiple prefixes, they can share the same
        base cache dir of /path/to/hash_str/rank_i_j/ ,
        to store some common compilation artifacts.
        zvllm_compile_cache.pyNvaluetyr(   c                 S   s,   t | |std| dt|  d|  d S )Nz	Expected z	 but got z for )rJ   	TypeErrortype)ra   rb   r7   r7   r8   
check_type   s   
z4CompilerManager.initialize_cache.<locals>.check_typekeyc                    sd   | \}}} |t   |t t|tr(|\}} |t   |t  t||d} |t |||fS )N)startend)intstrrJ   tupler   )rf   range_tuplegraph_indexcompiler_namerg   rh   )re   r7   r8   	parse_key   s   







z3CompilerManager.initialize_cache.<locals>.parse_keyc                    s   i | ]	\}} ||qS r7   r7   ).0rf   ra   )ro   r7   r8   
<dictcomp>   s    z4CompilerManager.initialize_cache.<locals>.<dictcomp>)r^   r_   r`   )r_   r^   ospathjoincache_file_pathexistsopenastliteral_evalreadr
   rd   rk   r   ri   rj   itemsrP   rK   initialize_cache)rS   r^   r_   r`   frP   r7   )re   ro   r8   r|      s    
z CompilerManager.initialize_cachec                 C   sd   | j s| jsd S tjdd}|| j}t| jd}|| W d    d S 1 s+w   Y  d S )N   )indentw)	r_   rQ   pprintPrettyPrinterpformatrP   rw   ru   write)rS   printerdatar}   r7   r7   r8   save_to_file   s   "zCompilerManager.save_to_filegraphexample_inputsrm   .c           	      C   s   ||| j jf| jvrd S dtdttttf tf fdd}z|| j||| j jf \}}W n
 ty7   Y d S w | j |||||}|| j|< t	
d|t|| j j| |S )Nra   r(   c                 S   sJ   t | tsJ | d }t |d tsJ t |d tsJ | d }||fS )Ngraph_handler   r   	cache_key)rJ   rO   rj   )ra   handler   r7   r7   r8   parse_value   s   z)CompilerManager.load.<locals>.parse_valuezGDirectly load the %s-th graph for compile range %sfrom %s via handle %s)rK   namerP   r
   rk   rj   	ExceptionloadrR   rF   rG   )	rS   r   r   rm   rX   r   r   r   compiled_graphr7   r7   r8   r      s,   "

zCompilerManager.loadzCompile graph	span_namer   r   additional_inductor_config
num_graphsc                    s  |dkrt  at jd7  _d }||||}|d ur=||d kr;t  t }	| j|	7  _tjdt	||	dd |S t
jtrFd }
nd}
|
|j d|j 7 }
|
d| 7 }
| d  tjjjj fd	d
}ddlm} tjjjddU |d|@ zj|||||
\}}W n) ty    d usJ j   Y W  d    W  d    W  d    S w W d    n1 sw   Y  W d    n1 sw   Y   d ur|d ur|j < W d    n1 sw   Y  |d usJ dt|r:|d ur:| dj||jjf< t jd7  _d_|dkr-tdt	| t d|t	|jj| ||d krYt  t }	| j|	7  _tjdt	||	dd |S )Nr   r   zTDirectly load the compiled graph(s) for compile range %s from the cache, took %.3f slocalscopeartifact_compile_range__
_subgraph_c                     s6   | i |}|d u rd S |d   j v rt |S r*   )rR   StopCompiling)r)   kwargsresultr   origrS   r7   r8   autograd_cache_keyD  s   
z3CompilerManager.compile.<locals>.autograd_cache_key)patchT)autograd_cache_normalize_inputsz@torch._functorch._aot_autograd.autograd_cache.autograd_cache_keyzFailed to compile the graph)r   r   z1Cache the graph of compile range %s for later usez?Store the %s-th graph for compile range%s from %s via handle %sz3Compiling a graph for compile range %s takes %.2f s)!timeperf_countercompilation_start_timer   num_backend_compilationsr   compilation_timerF   	info_oncerj   rJ   rK   r   rg   rh   r\   rD   
_functorch_aot_autogradautograd_cacher   unittest.mockr   configcompiler   rR   r   rP   r   num_cache_entries_updatedrQ   rG   )rS   r   r   r   r;   rX   rm   r   r   elapsed	maybe_keyr   r   r   r7   r   r8   r      s   
	3 
9
	zCompilerManager.compile)Fr]   r(   N)r   r   )__name__
__module____qualname____doc__r   rT   r   rj   rW   r   r   r   r\   boolr|   r   fxGraphModuler+   r
   ri   r   r   r   rO   r   r7   r7   r7   r8   rM   u   s`    

4
(
	rM   c                   @   s   e Zd ZdS )r   N)r   r   r   r7   r7   r7   r8   r     s    r   c                   @   s0   e Zd ZU eed< eed< eed< ejed< dS )	SplitItemsubmod_namegraph_idis_splitting_graphr   N)	r   r   r   rj   __annotations__ri   r   r   r   r7   r7   r7   r8   r     s
   
 r   nodec                 C   s   | j dkr
| jdkS | j dkrdS | j}|tjtjtjfv r dS t|tjjr+|j	}nt|tjj
r7| }ndS |dpB|dS )Ncall_method	new_emptycall_functionFTzaten::emptyzaten::new_empty)optargetrD   empty
empty_likeempty_stridedrJ   _opsOpOverloadPacket_qualified_op_name
OpOverloadr   
startswith)r   r   packet_namer7   r7   r8   _is_empty_allocation_node  s   



r   node_to_subgraph_idsplit_op_graphsc                    s   t t}  D ]\}}|| | qt|}dt  dd}t|d D ]@}||g }|s3q(||v}t	|dkoBt
|d }	d}
|	rbdurb|d }t fdd|jD rb |< d	}
|
sh|rh|q(dS )
z
    Merge a partition that only contains an empty allocation op into the
    previous partition. This avoids generating standalone empty submodules,
    which can lead to empty cudagraph captures.
    N)defaultr   r   Fc                 3   s&    | ]}|j d kp | kV  qdS )placeholderNr   )rp   
input_noder   prev_non_splitting_subgraph_idr7   r8   	<genexpr>  s    


z._merge_empty_only_subgraphs.<locals>.<genexpr>T)r   r+   r{   appendsetmaxvaluesrangegetlenr   allall_input_nodes)r   r   nodes_by_subgraph_idr   subgraph_idsplitting_subgraphsmax_subgraph_idnodesis_non_splitting_subgraphis_empty_only_subgraphmerged
empty_noder7   r   r8   _merge_empty_only_subgraphs  s0   
r   r   r[   c                    s\  d}i  g }| j jD ]P}|jdv rq
|jdkr4|jtjkr4|jd }|jdkr4| v s-J  |  |< q
t||rV|d7 }| |< || t|j	|rQ|d8 }q
|d7 }q
| |< q
t
 | tjjjj| d  fdddd	}g }d
d | D }|D ]$}	d|	v s|	dkrq}t||	}
t|	dd}|t|	|||v |
 q}|jdd d ||fS )Nr   )outputr   r   r   r   c                    s    |  S rV   r7   r   r   r7   r8   <lambda>      zsplit_graph.<locals>.<lambda>T)keep_original_orderc                 S   s   g | ]\}}|qS r7   r7   )rp   r   moduler7   r7   r8   
<listcomp>  s    zsplit_graph.<locals>.<listcomp>.r]   submod_c                 S   s   | j S rV   )r   )xr7   r7   r8   r     s    )rf   )r   r   r   r   operatorgetitemr)   r!   r   nextr   rD   r   passessplit_modulenamed_modulesgetattrri   replacer   sort)r   r[   r   r   r   r   split_gmoutputsnamesr   r   r   r7   r   r8   split_graph  sD   










r  g        piecewise_backendrU   is_first_graphis_last_graphc              	   C   sJ   |j  r|jr
| S ddlm} tt }|| |tj	||| |ddS )a?  
    Wrap a piecewise backend with CUDA graph wrapper if needed.
    This function is shared between VllmBackend and
    construct_serializable_fn_from_inductor_cache.

    Args:
        piecewise_backend: The backend to wrap
        vllm_config: The vLLM configuration
        compilation_config: The compilation configuration
        is_first_graph: Whether this is the first graph in the sequence
        is_last_graph: Whether this is the last graph in the sequence

    Returns:
        The wrapped backend if CUDA graphs are enabled, otherwise the original backend
    r   )CUDAGraphOptions)debug_log_enable
gc_disableweak_ref_output)runnablerU   runtime_modecudagraph_options)
cudagraph_modehas_piecewise_cudagraphsrZ   
cuda_graphr  r   r   get_static_graph_wrapper_clsr   	PIECEWISE)r  rU   r;   r  r  r  static_graph_wrapper_classr7   r7   r8   wrap_with_cudagraph_if_needed!  s&   r  c                
       s   e Zd ZdZdejjdee de	ddddf
 fd	d
Z
edddedef fddZdejjjdeejjjdf deeef defddZ  ZS )PiecewiseCompileInterpretera  Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.
    It runs the given split graph interpreter, and for each submodule in
    `compile_submod_names`, creates a PiecewiseBackend and compiles all
    ranges up front.

    NOTE: the order in `compile_submod_names` matters, because
    it will be used to determine the order of the compiled piecewise
    graphs. The first graph will handle logging, and the last graph
    has some special cudagraph output handling.

    Note: This class shares similar logic with
    reconstruct_serializable_fn_from_mega_artifact in caching.py.
    Both create PiecewiseBackend instances and wrap them with cudagraph.
    The key difference is:
    - reconstruct_serializable_fn_from_mega_artifact: PiecewiseBackend receives
      pre-compiled runnables (compiled_runnables is set, graph is None)
    - this class: PiecewiseBackend receives the FX graph to compile
      (graph is set, compiled_runnables is None)


    If modifying the backend creation/wrapping logic, consider updating both.
    r   compile_submod_namesrU   vllm_backendVllmBackendr(   Nc                    s0   t  | || _|j| _|| _|| _d| _d S rN   )superrT   r  r;   rU   r  extra_traceback)rS   r   r  rU   r  	__class__r7   r8   rT   o  s   
z$PiecewiseCompileInterpreter.__init__zInductor compilationr   r)   c                    s   t  j| S rV   )r  run)rS   r)   r  r7   r8   r  ~  s   zPiecewiseCompileInterpreter.runr   .r   c              
   C   s   t |tsJ t| j|}|j jd }t|dd }|| j	v rl| j	
|}| |}dd t|D }	ddlm}
 ddlm} ||| j|t| j	|	| j|
||d	}t|| j| j|j|j| jj|< t jd7  _|S )
Nr   c                 S   s
   | j d S )Nexample_value)metar   r7   r7   r8   r        
 z9PiecewiseCompileInterpreter.call_module.<locals>.<lambda>c                 S   s    g | ]\}}t |tjr|qS r7   )rJ   rD   SymIntrp   r1   r   r7   r7   r8   r     s
    z;PiecewiseCompileInterpreter.call_module.<locals>.<listcomp>)graph_returns_tupler   PiecewiseBackend)r   )rJ   rj   r   r   r   output_noder)   r   map_argr  r2   
fetch_attrr,   torch._inductor.compile_fxr$  r  r&  rU   r   r  r  r;   r  r  __dict__r   $num_piecewise_capturable_graphs_seen)rS   r   r)   r   gmr  r   r2   submodsym_shape_indicesr$  r&  r  r7   r7   r8   call_module  s>   

z'PiecewiseCompileInterpreter.call_module)r   r   r   r   rD   r   r   r+   rj   r   rT   r   r
   r  r   Targetrk   ArgumentrO   r0  __classcell__r7   r7   r  r8   r  W  s0    
r  backbone	model_tagFmodel_is_encodertag
is_encoderrY   c                 c   sP    | t ksJ d|  dt  dt }t}| a |az
dV  W |a |adS |a |aw )z%Context manager to set the model tag.z
Model tag z  is the same as the current tag r   N)r5  r6  )r7  r8  old_tagold_is_encoderr7   r7   r8   set_model_tag  s   
r;  c                	   @   s  e Zd ZU dZeed< eed< dZeed< e	j
ed< e	j
ed< ee ed< ed	ef ed
< eed	ef  ed< eed< eeef ed< 		ddedededdfddZdeeeeee f dB eeef dB f fddZd ddZdd Zedde	j
dee defddZdS )!r  a  The compilation backend for `torch.compile` with vLLM.
    It is used for compilation mode of `CompilationMode.VLLM_COMPILE`,
    where we customize the compilation.

    The major work of this backend is to split the graph into
    piecewise graphs, and pass them to the piecewise backend.

    This backend also adds the PostGradPassManager to Inductor config,
    which handles the post-grad passes.
    rU   r;   F_calledr   r  piecewise_graphs.returned_callablepost_grad_passescompiler_managerinductor_configr]   r`   r8  r(   Nc                 C   sX   |pt | _|pt| _tt  | _tj| _|| _	|j
| _
t| j
| _t| j
j| _d S rV   )r5  r`   r6  r8  r   r   get_pass_manager_clspass_managerpass_keyrU   r;   rM   r@  r   inductor_compile_configrA  )rS   rU   r`   r8  r7   r7   r8   rT     s   

	zVllmBackend.__init__c              	   C   s   t jsdS ddlm} ddlm} | }i }i }| j D ]B\}}t| j|}t	|dr/|j
n|}	t|	|s7q|}
|	j||
< |	j||
< |	  D ]\}}||
|| td|
|t| qIqtd| | |  tdt|j  |||fS )	a  Collect inductor cache artifacts from all piecewise backends.

        Returns:
            tuple: (standalone_compile_artifacts, sym_shape_indices_map,
                    returns_tuple_map)
                - standalone_compile_artifacts: StandaloneCompiledArtifacts
                  with compiled artifacts
                - sym_shape_indices_map: dict mapping submod_name to
                  sym_shape_indices
                - returns_tuple_map: dict mapping submod_name to
                  returns_tuple
        rY   r   )StandaloneCompiledArtifactsr%  r  z-collected artifact for %s shape %s (%d bytes)z=collected artifacts: %d entries, %d artifacts, %d bytes totalz$standalone compile artifact keys: %s)r?   r@   cachingrF  r  r&  r  named_childrenr   rC   r  rJ   r/  returns_tupleto_bytesr{   insertrF   rG   r   infonum_entriesnum_artifacts
size_bytesr+   submodule_byteskeys)rS   rF  r&  standalone_compile_artifactssym_shape_indices_mapreturns_tuple_mapr   r   childr  r   	shape_str
bytes_datar7   r7   r8   $collect_standalone_compile_artifacts  sF   


	
z0VllmBackend.collect_standalone_compile_artifactsc                 C   st   | j | j | j| jv r1t| j| j trtdt| jj	| j t
s&J | j | jj	| j  | j | j| j< d S )Nz9PostGradPassManager can not be kept in CompilationConfig.)rC  	configurerU   rD  rA  rJ   r$   
ValueErrorr;   rE  r"   addrS   r7   r7   r8   configure_post_passN  s   zVllmBackend.configure_post_passc                    s^   j   jdtdB dtfddfddtD tdd	d
  fdd
d dS )z4Log vLLM compilation config for TORCH_TRACE/tlparse.lstNr(   c                 S   s    | d u rdS d dd | D S )Nr]   z, c                 s   s    | ]}t |V  qd S rV   )rj   rp   r   r7   r7   r8   r   l      zKVllmBackend._log_compilation_config.<locals>.list_to_str.<locals>.<genexpr>rt   )r^  r7   r7   r8   list_to_stri  s   z8VllmBackend._log_compilation_config.<locals>.list_to_strc                    s0   g | ]}t t |jtrt |jr|jqS r7   )rJ   r   r   r   )rp   r}   )pass_cfgr7   r8   r   o  s    
z7VllmBackend._log_compilation_config.<locals>.<listcomp>artifactc                   S   s
   dddS )Nvllm_compilation_configjson)r   encodingr7   r7   r7   r7   r8   r   w  s   z5VllmBackend._log_compilation_config.<locals>.<lambda>c                      sv   t jjjjt j j j	 j
t j j j jt j t jj jjdS )N)modelr`   moderB   
custom_opsr[   r  compile_sizescompile_ranges_endpointsrZ   inductor_passesenabled_passesdynamic_shapes_typedynamic_shapes_evaluate_guards)rf  dumpsrU   model_configrh  r`   rj   ri  rB   rj  r[   r  rk  rl  rZ   r+   rm  rQ  dynamic_shapes_configrd   evaluate_guardsr7   )ccrn  rb  rS   r7   r8   r   {  s&    
metadata_fn
payload_fn)r;   pass_configr+   rj   dataclassesfieldsr   r\  r7   )ru  rn  rb  rc  rS   r8   _log_compilation_configc  s   

z#VllmBackend._log_compilation_configr  r   c           ,   
      s*  ddl m} j}  t }t|}| }j|}t	t
jjtdtfdd g }	D ]:}
|	|
 |
dkrCq7zt|
}|	|  W d    n1 sZw   Y  W q7 ttfyq   td|
 Y q7w td|	  }jj  jjs||||g}tt|  d d	 }tjtj d
|}|j_jj}tj!|dd |j_|j"j#}|j"j$}tj|d| d| j%}tj!|dd |j_&t'j( }|j)d uo|j)* }|p|}|rtj+ddd ntj+d|dd j,||j% td||||| zCtdtt-t.j/|dd| tj|d}tj0|sVt|d}t1j2||||d|ddd W d    n	1 sQw   Y  W n t3yi   tjd|dd Y nw t4 j5d7  _5ddl6m7} t89 | }tj+d|dd j j:|7  _:t;|d  }d!|i}t<d"|d | j=rJ d#|_>?  jj@rg }njjApg }tB||\_C_Dd }tjErtFjC}d$d%lGmH} |d&j> |d'jC tId(d)d fd*dd+ t4 jJtKjD7  _Jd,d- jDD }g  |j>jLd.d/D ]} |jMd0  q
 fd1d-tND }tOjC|jjP|  t89 } jQ  t89 |  }!|!dkrItj+d2|!dd d$d3lRmS}" |" }#jjTjUrjjTjVtWjXkrd$d4lYmZ}$ |#j[j\] D ]\}%}&|&j^dkr|$d$|&j_|#j[j\|%< qmtj|d5}'tj0|'sd6jCj`d7d8 }(|(ad9d:}(t|'d}|b|( W d    n	1 sw   Y  tjcd;|'dd d_=tjEr|nj>})jjdtejfks܈jjgs||)j%jCjhd<S d$d=limj fd>d-tN|D }*tk|*fd?d-|*D jC}+||)j%|+jh|*d@S )ANr   )VllmSerializableFunctionz9Traced files (to be considered for compilation cache):
%sc                      s
   d  S )N
ra  r7   )forward_code_filesr7   r8   r     r!  z&VllmBackend.__call__.<locals>.<lambda>z<string>zFailed to read file %sr~  
   torch_compile_cacheT)exist_okrank_r   z'vLLM's torch.compile cache is disabled.r   r   z2Using cache directory: %s for vLLM's torch.compilezAtorch.compile cache factors: env=%s cfg=%s comp=%s code=%s dir=%sz2Compile env factors (raw):
%s
Vllm config hash: %sx   )widthzcache_key_factors.jsonr   )envconfig_hash	code_hashcompiler_hash   )r   	sort_keyszCould not write compile cache metadata at %s; continuing without metadata. Compiled cache remains valid; diagnostics may be limited.)exc_info)torch_compile_start_timez&Dynamo bytecode transform time: %.2f sg    eAzdynamo.time_secondszDynamo bytecode transformz#VllmBackend can only be called oncer   )lazy_format_graph_codezbefore splitzafter split
graph_dumpc                   S   s   ddiS )Nr   vllm_piecewise_split_graphr7   r7   r7   r7   r8   r   A  r   c                      s    j jddS )NFprint_output)r  print_readabler7   r\  r7   r8   r   B  s    rv  c                 S   s   g | ]}|j s|jqS r7   )r   r   )rp   itemr7   r7   r8   r   F  s    z(VllmBackend.__call__.<locals>.<listcomp>r   r   r  c                    s(   g | ]\}}t |tjr | n|qS r7   )rJ   rD   Tensor)rp   r1   t)all_fake_valuesr7   r8   r   Q  s    z-Saved compiler manager cache in %.2f seconds.)detect_fake_mode)ValueRangeszcomputation_graph.pyz0from __future__ import annotations
import torch
Fr  z<lambda>r   zComputation graph saved to %s)r8  r  is_symbolicc                    s>   g | ]\}}t |tjjjrt fd d| D r|qS )c                 3   s    | ]} |V  qd S rV   r7   )rp   dr  r7   r8   r     r`  z2VllmBackend.__call__.<locals>.<listcomp>.<genexpr>)rJ   rD   _subclassesfake_tensor
FakeTensoranysizer#  r  r7   r8   r     s    c                    s   g | ]} |   qS r7   )r.   r_  )r   r7   r8   r     s    )r8  r  r%   )lrG  r}  rU   r|  r?   compile_factorsr   rW   r@  r+   sortedr;   traced_filesrF   rG   r   r   rw   rz   OSErrorUnicodeDecodeErrorwarninghashlibsha256rt   encode	hexdigestclearr^   rj   rr   rs   VLLM_CACHE_ROOTmakedirsparallel_configrankdata_parallel_indexr`   local_cache_dirr   rA  speculative_configuse_ngram_gpur   r|   r   r   r   rv   rf  dumpr   r   num_graphs_seenmonitorr  r   r   r   ri   r   r<  r   r]  rZ   r[   r  r  r=  r@   r   torch._dynamo.utilsr  r   num_piecewise_graphs_seenr   
find_nodesr   r,   r  r  r   torch._guardsr  rs  rt  rd   r   BACKEDtorch.utils._sympy.value_rangesr  	shape_envvar_to_ranger{   lowerupperr  r   r   
debug_oncer  r   NONEcudagraph_copy_inputsr8  %torch.fx.experimental.symbolic_shapesr  r:   ),rS   r   r   r}  rU   env_factorsenv_hashr  r  hash_contentfilepathr}   r  factorshash_keyr^   r  dp_rankr  r_   is_ngram_gpu_enabled	meta_pathr  dynamo_time
start_time
attributesfx_split_opsoriginal_split_gmr  submod_names_to_compiler1   	fake_argstime_before_savingr   r  	fake_moder  sr
graph_pathsrcgraph_to_serializer%   r9   r7   )r  r   r  r  rS   r8   __call__  s  








	




zVllmBackend.__call__)r]   Fr   )r   r   r   r   r   r   r   r<  r   r   r   r+   r   r   r
   r   rM   rO   rj   rT   rk   ri   rX  r]  r|  r   r  r7   r7   r7   r8   r    s<   
 


'(

>."r  )F)arx   rz  r  rf  r   rr   r   r   collectionsr   collections.abcr   r   r   
contextlibr   copyr   	functoolsr   typingr
   rD   torch.fxr   r  r   torch._logging._internalr   	vllm.envsr?   vllm.configr   r   r   vllm.config.compilationr   vllm.config.utilsr   r   vllm.loggerr   vllm.logging_utilsr   vllm.platformsr   vllm.tracingr   r   vllm.utils.import_utilsr   compiler_interfacer   r   r   r   r   counterr   partition_rulesr    r!   passes.inductor_passr"   r#   passes.pass_managerr$   r   rF   r+   ri   r  r:   rL   rM   BaseExceptionr   	dataclassr   Noder   r   rO   r   r   rj   rk   r  r   r  Interpreterr  r5  r   r6  r;  r  r7   r7   r7   r8   <module>   s   



%  
*
D
6^