o
    :/i8                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dl	Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ eeZdejdee fddZdejdedee fddZe j G dd dZ!G dd dZ"dS )    N)Callable)PicklerAny)CachingAutotuner)trace_structured)VllmBackend)
VllmConfig)Range)init_loggergraphreturnc                 C   s6   g }| j jD ]}|jdkr||jd  q |S |S )z4Get fake args directly from graph placeholder nodes.placeholderexample_value)r   nodesopappendmeta)r   	fake_argsnode r   o/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/compilation/piecewise_backend.pyget_fake_args_from_graph   s   
r   sizec                    sZ  ddl m} ddlm} ddlm}m dtdtffdd || d	}g }|y | j	j
D ]c}|jd
kr8 nc|jd }t|tjrK| | q/t|tjrt fdd|jD }	t fdd| D }
 | }||	|
|}tj||j|jd}||	|
|}|| q/|| q/W d   |S W d   |S 1 sw   Y  |S )zCreate Fake example inputs with symbolic dims replaced by a concrete size.

    Used for single-size compilation where we need concrete-shaped inputs.
    The Dynamo-captured graph gives us example inputs with SymInts in them.
    r   )compute_required_storage_length)FakeTensorMode)ShapeEnvis_symbolicsym_valr   c                    s6    | st | S | jj}t |fdd|jD S )z@Replace all symbolic variables in a SymInt expression with size.c                    s   i | ]}| qS r   r   .0s)r   r   r   
<dictcomp>3   s    z<create_concrete_args.<locals>.concretize.<locals>.<dictcomp>)intr   exprsubsfree_symbols)r   r$   )r   r   r   r   
concretize.   s   z(create_concrete_args.<locals>.concretize)	shape_envr   r   c                 3       | ]} |V  qd S Nr   )r    dr'   r   r   	<genexpr>@       z'create_concrete_args.<locals>.<genexpr>c                 3   r)   r*   r   r   r,   r   r   r-   A   r.   )dtypedeviceN)torch._prims_commonr   torch._subclasses.fake_tensorr   %torch.fx.experimental.symbolic_shapesr   r   r   r#   r   r   r   r   
isinstancetorchSymIntr   Tensortupleshapestridestorage_offsetemptyr/   r0   
as_strided)r   r   r   r   r   	fake_modeargsr   val	new_shapenew_stridesnew_storage_offsetneeded_sizetr   )r'   r   r   r   create_concrete_args$   s@   



rF   c                   @   s6   e Zd ZU eed< dZeed< dZede	f ed< dS )
RangeEntrycompile_rangeFcompiledN.runnable)
__name__
__module____qualname__r
   __annotations__rI   boolrJ   r   r   r   r   r   r   rG   N   s   
 rG   c                   @   s   e Zd Z		d"dejdB dedededee ded	e	d
e
eedef f dB defddZdedef dedef fddZde
eef fddZd#ddZdefddZd#ddZdededB fddZdedefd d!ZdS )$PiecewiseBackendN r   vllm_configpiecewise_compile_indextotal_piecewise_compilessym_shape_indicesvllm_backendreturns_tuplecompiled_runnables.submod_namec
                 C   s  t |dut |duA sJ d|| _|| _|j| _|| _|| _|| _|| _|	| _|dk| _	||d k| _
|dk| _|j| _| j | _| jrad}
| jd }|j|jjksWJ t|j|
d| jd< d| j }t| | jj| _d	| j }t| || _|| _i | _| jdur| jD ]+}t|tr|d
ksJ tdt|tsJ t||d}|| jvrt|d| j|< q| jD ]
}t|d| j|< qd| _ | jdur| !  dS | "  dS )ad  
        The backend for piecewise compilation.
        It mainly handles the compilation of static shapes and
        dispatching based on runtime shape.

        We will compile `self.graph` once for the general shape,
        and then compile for different shapes specified in
        `compilation_config.compile_sizes`.

        This class supports two mutually exclusive modes:
        1. Compilation (graph is set, compiled_runnables is None):
           Used during initial compilation when we have the FX graph
           and need to compile it for each shape range.
        2. Precompilation (graph is None, compiled_runnables is set):
           Used when loading from cache/AOT artifacts where we already
           have pre-compiled callables and don't need the original graph.

        Exactly one of graph or compiled_runnables must be provided.
        Nz:exactly one of graph and compiled_runnables should be set.r      istartendz"PiecewiseBackend: compile_ranges: z!PiecewiseBackend: compile_sizes: cudagraph_capture_sizeszmcudagraph_capture_sizes not supported in compile_sizes.This should be handled in `post_init_cudagraph_sizes`.)rH   F)#rO   r   rR   compilation_configrS   rT   rV   rX   rY   is_first_graphis_last_graphis_full_graph
is_encoderis_encoder_compilationget_compile_rangescompile_rangesr^   scheduler_configmax_num_batched_tokensr
   r]   logger
debug_oncecompile_sizesrU   rW   range_entriesr4   strNotImplementedErrorr#   rG   _graph_loggedcompile_all_rangesload_all_ranges)selfr   rR   rS   rT   rU   rV   rW   rX   rY   	max_int32last_compile_range
log_stringr   ranger   r   r   __init__V   sn   











zPiecewiseBackend.__init__compiled_graphr   c                    s   dt dt f fdd}|S )Nr?   r   c                     s(    |  }j st|ttfs|S |d S )Nr   )rW   r4   r8   list)r?   graph_outputry   rs   r   r   compiled_graph_wrapper   s   zKPiecewiseBackend.get_compiled_graph_wrapper.<locals>.compiled_graph_wrapperr   )rs   ry   r}   r   r|   r   get_compiled_graph_wrapper   s   
z+PiecewiseBackend.get_compiled_graph_wrapperc                    s|   G dd dt  dtdtf dtf fdd}i }| j D ]\}}|js,td| qt	|j
d	r;||j
|t|< q|S )
Nc                   @   s   e Zd ZdedefddZdS )zEPiecewiseBackend.to_bytes.<locals>.StandaloneCompiledArtifactsPicklerobjr   c                 S   s(   t |tr|  tjt|ffS tS r*   )r4   r   prepare_for_picklepickleloadsdumpsNotImplemented)rs   r   r   r   r   reducer_override   s   
zVPiecewiseBackend.to_bytes.<locals>.StandaloneCompiledArtifactsPickler.reducer_overrideN)rK   rL   rM   objectr   r   r   r   r   r   "StandaloneCompiledArtifactsPickler   s    r   fn.r   c                    sn   t | ds	J dtjjdd |  }t } || |	 }W d    |S 1 s0w   Y  |S )N	serializezfn must have serialize methodbundled_autograd_cacheT)
hasattrr5   
_functorchconfigpatchr   ioBytesIOdumpgetvalue)r   entryfresultr   r   r   r      s   

z,PiecewiseBackend.to_bytes.<locals>.serializez9entry with range %s not compiled, so cannot get its bytesr   )r   r   r   bytesrm   itemsrI   rj   debugr   rJ   rn   )rs   r   out	range_keyr   r   r   r   to_bytes   s   
zPiecewiseBackend.to_bytesc              
   C   s   | j dus	J d| j D ]8}|jrq| |j |j r(t| j |jj}nt	| j }| j
jj| j || j
j| j|j| j| jd|_d|_qdS )z?Compile all range entries for this piecewise subgraph up front.NzoCannot compile without a graph. When loading from cache/AOT artifacts, compile_all_ranges should not be called.)rH   graph_index
num_graphsT)r   rm   valuesrI   _log_compile_startrH   is_single_sizerF   r]   r   rV   compiler_managercompileinductor_configr`   rS   rT   rJ   )rs   range_entry	args_listr   r   r   rq      s.   



z#PiecewiseBackend.compile_all_rangesrH   c                    s   j duo
 jj v jjtddd  fddd js@d_jdus/J tdfd	dfd
dd dS dS )z.Log compilation event for TORCH_TRACE/tlparse.Nartifactc                   S   s
   dddS )Nvllm_piecewise_compile_startjson)nameencodingr   r   r   r   r   <lambda>  s   z5PiecewiseBackend._log_compile_start.<locals>.<lambda>c                
      s$   t j j j  dS )N)piecewise_indexrY   rT   compile_range_startcompile_range_endr   is_cudagraph_capture_size)r   r   rT   r]   r^   r   r   rH   is_cudagraph_sizers   subgraph_indexrY   r   r   r   "  s    )metadata_fn
payload_fnT
graph_dumpc                      s   dd  iS )Nr   vllm_r   r   )rY   r   r   r   6  s   
c                      s    j jddS )NF)print_output)r   print_readabler   )rs   r   r   r   9  s    )rl   r]   rS   rY   r   rp   r   )rs   rH   r   r   r   r     s$   


z#PiecewiseBackend._log_compile_startc                 C   sz   | j dus	J d| j D ],}|jrqt|j}|| j v s.J d|j dt| j   | | j | |_	d|_qdS )zLoad all pre-compiled runnables for this piecewise subgraph.

        Called during warm start to wrap all cached compiled_runnables
        into range_entry.runnable up front, analogous to compile_all_ranges()
        for the cold start path.
        Nzgload_all_ranges should only be called when compiled_runnables is set (warm start / cache loading path).z$Missing compiled runnable for range z. Available keys: T)
rX   rm   r   rI   rn   rH   rz   keysr~   rJ   )rs   r   keyr   r   r   rr   <  s"   

z PiecewiseBackend.load_all_rangesruntime_shapec                 C   sP   | j d u rd S || j v r| jt||d S | jD ]}||v r%| j|   S qd S )Nr\   )rl   rm   r
   rg   )rs   r   rw   r   r   r   _find_range_for_shapeT  s   


z&PiecewiseBackend._find_range_for_shaper?   c                 G   sV   || j d  }| |}|d usJ d| d| j |js&J d|j |j| S )Nr   zShape: z out of considered ranges: z[All ranges should be compiled or loaded up front in PiecewiseBackend.__init__. range_entry=)rU   r   rg   rI   rH   rJ   )rs   r?   r   r   r   r   r   __call__c  s   


zPiecewiseBackend.__call__)NrQ   )r   N)rK   rL   rM   fxGraphModuler	   r#   rz   r   rO   dictrn   r   r   rx   r~   r   r   rq   r
   r   rr   rG   r   r   r   r   r   r   rP   U   sB    
	

k



$!
'rP   )#dataclassesr   r   r   collections.abcr   r   typingr   torch._functorch.configr5   torch.fxr   )torch._inductor.runtime.triton_heuristicsr   torch._logging._internalr   vllm.compilation.backendsr   vllm.configr	   vllm.config.utilsr
   vllm.loggerr   rK   rj   r   rz   r   r#   rF   	dataclassrG   rP   r   r   r   r   <module>   s*   *