import importlib.util
import os


def _get_torch_cuda_version():
    """Peripheral function to _maybe_set_cuda_compatibility_path().
    PyTorch version must not be determined by importing directly
    because it will trigger the CUDA initialization, losing the
    chance to set the LD_LIBRARY_PATH beforehand.
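
    Example (illustrative value; the actual string depends on the installed
    torch wheel, and None is returned when it cannot be determined):

        >>> _get_torch_cuda_version()
        '12.8'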
    """
    try:
        spec = importlib.util.find_spec("torch")
        if not spec:
            return None
        # Locate the torch package directory without importing the package.
        if spec.origin:
            torch_root = os.path.dirname(spec.origin)
        elif spec.submodule_search_locations:
            torch_root = spec.submodule_search_locations[0]
        else:
            return None
        version_path = os.path.join(torch_root, "version.py")
        if not os.path.exists(version_path):
            return None
        # Execute only torch/version.py, which has no import side effects.
        ver_spec = importlib.util.spec_from_file_location(
            "torch.version", version_path
        )
        if not ver_spec or not ver_spec.loader:
            return None
        module = importlib.util.module_from_spec(ver_spec)
        ver_spec.loader.exec_module(module)
        return getattr(module, "cuda", None)
    except Exception:
        return None


def _maybe_set_cuda_compatibility_path():
    """Set LD_LIBRARY_PATH for CUDA forward compatibility if enabled.

    Must run before 'import torch' since torch loads CUDA shared libraries
    at import time and the dynamic linker only consults LD_LIBRARY_PATH when
    a library is first loaded.

    CUDA forward compatibility is only supported on select professional and
    datacenter NVIDIA GPUs. Consumer GPUs (GeForce, RTX) do not support it
    and will get Error 803 if compat libs are loaded.
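
    Example (hypothetical paths and command line; the compat directory comes
    from NVIDIA's driver compatibility package):

        export VLLM_ENABLE_CUDA_COMPATIBILITY=1
        export VLLM_CUDA_COMPATIBILITY_PATH=/usr/local/cuda-12.8/compat
        vllm serve <model>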
    """
    enable = os.environ.get(
        "VLLM_ENABLE_CUDA_COMPATIBILITY", "0"
    ).strip().lower() in ("1", "true")
    if not enable:
        return

    # An explicitly configured path wins.
    cuda_compat_path = os.environ.get("VLLM_CUDA_COMPATIBILITY_PATH", "")

    # Otherwise, try a conda-provided cuda-compat directory.
    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
        conda_prefix = os.environ.get("CONDA_PREFIX", "")
        conda_compat = os.path.join(conda_prefix, "cuda-compat")
        if conda_prefix and os.path.isdir(conda_compat):
            cuda_compat_path = conda_compat

    # Finally, fall back to the default location for the CUDA version that
    # torch was built against.
    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
        torch_cuda_version = _get_torch_cuda_version()
        if torch_cuda_version:
            default_path = f"/usr/local/cuda-{torch_cuda_version}/compat"
            if os.path.exists(default_path):
                cuda_compat_path = default_path

    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
        return

    # Prepend the compat directory to LD_LIBRARY_PATH (deduplicated) so the
    # dynamic linker consults it when torch first loads the CUDA libraries.
    norm_path = os.path.normpath(cuda_compat_path)
    existing = os.environ.get("LD_LIBRARY_PATH", "")
    ld_paths = existing.split(os.pathsep) if existing else []
    if ld_paths and os.path.normpath(ld_paths[0]) == norm_path:
        return
    new_paths = [norm_path] + [
        p for p in ld_paths if p and os.path.normpath(p) != norm_path
    ]
    os.environ["LD_LIBRARY_PATH"] = os.pathsep.join(new_paths)


_maybe_set_cuda_compatibility_path()

import torch  # noqa: E402

from vllm.logger import init_logger  # noqa: E402
from vllm.utils.torch_utils import is_torch_equal  # noqa: E402

logger = init_logger(__name__)

# Use NVML for device availability checks instead of initializing a CUDA
# context.
os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"
# Keep TorchInductor compilation in-process (no compile-thread pool).
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
torch._inductor.config.compile_threads = 1


# The functions below monkeypatch TorchInductor for torch 2.9.x. They are
# installed at the bottom of this file, guarded by is_torch_equal("2.9.0").


def memory_plan_reuse_patched(self):
    import torch._inductor.ir as ir
    from torch._inductor.codegen.wrapper import (
        EnterSubgraphLine,
        ExitSubgraphLine,
        MemoryPlanningLine,
        MemoryPlanningState,
        SubgraphPythonWrapperCodegen,
    )
    from torch._inductor.virtualized import V

    def get_output_names(graph_outputs) -> list[str]:
        import itertools

        names = []
        shape_counter = itertools.count(0)
        none_counter = itertools.count(0)
        for node in graph_outputs:
            if isinstance(node, ir.NoneAsConstantBuffer):
                names.append(f"{V.graph.name}_none{next(none_counter)}")
            elif isinstance(node, ir.ShapeAsConstantBuffer):
                names.append(f"{V.graph.name}_shape{next(shape_counter)}")
            else:
                names.append(node.get_name())
        return names

    # For a subgraph wrapper that carries a partition signature, the real
    # outputs are the partition's output nodes, not the graph's.
    if (
        isinstance(V.graph.wrapper_code, SubgraphPythonWrapperCodegen)
        and V.graph.wrapper_code.partition_signatures is not None
    ):
        out_names = get_output_names(
            V.graph.wrapper_code.partition_signatures.output_nodes
        )
    else:
        out_names = V.graph.get_output_names()

    # Trailing memory-planning lines for non-outputs are pointless; drop them.
    while (
        self.lines
        and isinstance(self.lines[-1], MemoryPlanningLine)
        and self.lines[-1].node.name not in out_names
    ):
        self.lines.pop()

    # Plan allocations, tracking one planning state per (sub)graph.
    planning_states = [MemoryPlanningState()]
    past_planning_states = []
    for i in range(len(self.lines)):
        line = self.lines[i]
        if isinstance(line, MemoryPlanningLine):
            self.lines[i] = line.plan(planning_states[-1])
        elif isinstance(line, EnterSubgraphLine):
            planning_states.append(MemoryPlanningState())
        elif isinstance(line, ExitSubgraphLine):
            past_planning_states.append(planning_states.pop())
    past_planning_states.append(planning_states.pop())
    assert len(planning_states) == 0


def get_graph_partition_signature_patched(
    self, partitions, skip_cudagraphs: list[bool]
):
    """
    Gets the signature for each graph partition: its input nodes, its output
    nodes, and whether each input is deallocated within the partition.
    """
    from torch._inductor import dependencies
    from torch._inductor.ir import (
        GraphPartitionSignature,
        MutationOutput,
        NoneLayout,
    )
    from torch._inductor.virtualized import V
    from torch.utils._ordered_set import OrderedSet

    signatures = []

    unmet_output_names = OrderedSet(V.graph.get_output_names())
    name_to_node = self.get_name_to_nodes()

    def is_none_layout(buf_name: str) -> bool:
        """
        Checks whether buf_name refers to a NoneLayout buffer. Such buffers
        are never allocated, so a graph partition should not take them as
        inputs or outputs.
        """
        buf = self.name_to_buf.get(buf_name, None)
        if buf is None:
            return False
        if isinstance(buf.node.layout, NoneLayout):
            if isinstance(buf.node, MutationOutput) and (
                real_name := self.mutation_real_name.get(buf_name, None)
            ):
                return is_none_layout(real_name)
            return True
        return False

    for partition, skip_cudagraph in zip(
        reversed(partitions), reversed(skip_cudagraphs)
    ):
        output_names: OrderedSet[str] = OrderedSet()
        for node in partition:
            output_names.update(node.outputs_by_name.keys())

        returned_output_names = output_names.intersection(unmet_output_names)

        # All reads/writes are partition inputs, except buffers generated
        # within the partition and unallocated (NoneLayout) buffers.
        read_writes = dependencies.ReadWrites.merge_list(
            [node.read_writes for node in partition]
        )
        partition_input_names = (
            OrderedSet(
                x.name
                for x in read_writes.reads | read_writes.writes
                if not is_none_layout(x.name)
            )
            - output_names
        )
        partition_input_names = OrderedSet(
            self.mutation_real_name.get(name, name)
            for name in partition_input_names
        )

        buffer_names_to_free: OrderedSet[str] = OrderedSet()
        for node in partition:
            buffer_names_to_free.update(node.last_usage)

        # A buffer freed here but allocated by an earlier partition must also
        # become an input of this partition.
        extra_input_names = [
            name
            for name in (buffer_names_to_free - output_names)
            if name in name_to_node
        ]
        partition_input_names.update(extra_input_names)

        input_nodes = {
            name: name_to_node[name]
            for name in partition_input_names
            if name in name_to_node
        }
        input_deallocation = {
            name: name in buffer_names_to_free
            for name in partition_input_names
            if name in name_to_node
        }

        # Inputs that are not freed inside the partition are also returned as
        # outputs, so cudagraph can keep them at static addresses.
        extra_output_names = [
            name
            for name in partition_input_names
            if name in name_to_node and name not in buffer_names_to_free
        ]
        returned_output_names.update(extra_output_names)

        returned_output_names = OrderedSet(
            self.mutation_real_name.get(name, name)
            for name in returned_output_names
        )
        returned_output_names = [
            name for name in returned_output_names if not is_none_layout(name)
        ]
        output_nodes = [name_to_node[name] for name in returned_output_names]

        constant_names = [
            name for name in partition_input_names if name in V.graph.constants
        ]

        symbol_inputs = self.get_graph_partition_symbol_inputs(
            partition, input_nodes
        )

        partition_signature = GraphPartitionSignature(
            symbol_inputs,
            input_nodes,
            output_nodes,
            input_deallocation,
            skip_cudagraph,
            constant_names,
        )
        signatures.append(partition_signature)

        # Inputs of this partition are outputs that some earlier partition
        # still has to produce.
        unmet_output_names = partition_input_names.union(
            unmet_output_names - output_names
        )

    return signatures[::-1]


def should_partition_patched(self, node, should_log: bool = False) -> bool:
    """Return True if we should partition the inductor graph on this node"""
    import torch._inductor.ir as ir
    from torch._inductor.scheduler import BaseSchedulerNode, FusedSchedulerNode
    from torch._inductor.utils import (
        _unstable_customized_partition_wrapper,
        is_cudagraph_unsafe_op,
        maybe_log_cudagraph_partition,
    )

    # Users can force a partition boundary for specific fallback ops via the
    # custom_should_partition_ops config entry registered below.
    ir_node = node.node
    if isinstance(ir_node, ir.FallbackKernel) and ir_node.op_overload is not None:
        op = ir_node.op_overload
        op_overload_packet_name = op.name()
        op_overload_name = (
            f"{op_overload_packet_name}.{op._overloadname}"
            if isinstance(op, torch._ops.OpOverload)
            else op_overload_packet_name
        )
        if (
            op_overload_packet_name
            in torch._inductor.config.custom_should_partition_ops
            or op_overload_name
            in torch._inductor.config.custom_should_partition_ops
        ):
            assert isinstance(op, torch._ops.OpOverload)
            return True

    # Without cudagraphs (and without a customized partition wrapper), graph
    # partitioning brings no benefit: keep everything in one function.
    if (
        not torch._inductor.config.triton.cudagraphs
        and _unstable_customized_partition_wrapper.wrapper is None
    ):
        return True

    def noop_log(msg: str, node: BaseSchedulerNode | None) -> None:
        return None

    log_partition_reason = maybe_log_cudagraph_partition if should_log else noop_log

    if isinstance(node, FusedSchedulerNode):
        return any(self.should_partition(snode) for snode in node.snodes)

    assert node.node is not None

    if not node.is_gpu():
        log_partition_reason("non gpu ops", node=node)
        return True

    if isinstance(node.node, ir.DeviceCopy):
        log_partition_reason("DeviceCopy ops", node=node)
        return True

    if isinstance(node.node, ir.Conditional):
        log_partition_reason("Conditional ops", node=node)
        return True

    if getattr(node.node, "unbacked_bindings", None):
        log_partition_reason("unbacked binding ops", node=node)
        return True

    if is_cudagraph_unsafe_op(node.node):
        log_partition_reason("CUDAGraph-unsafe custom ops", node=node)
        return True

    return False


def _update_scheduler_patched(self) -> None:
    """
    (Re)initializes the scheduler member.  When initializing the scheduler, no CUBIN
    files should be generated (to avoid biasing any benchmarks and pessimizing
    fusion decisions).
    """
    import torch._inductor.config as config
    from torch._inductor.scheduler import Scheduler

    # Swap in the patched partition logic before the Scheduler is built.
    Scheduler.should_partition = should_partition_patched
    Scheduler.get_graph_partition_signature = get_graph_partition_signature_patched

    with config.patch("triton.store_cubin", False):
        self.scheduler = Scheduler(self.operations)


def _patch_get_raw_stream_if_needed():
    """Workaround for TorchInductor autotune get_raw_stream() bug."""
    from vllm.utils.torch_utils import is_torch_equal

    if is_torch_equal("2.9.0") or is_torch_equal("2.9.1"):
        import builtins

        if hasattr(torch._C, "_cuda_getCurrentRawStream"):
            # Some autotuning code paths look up get_raw_stream as a builtin;
            # make sure that lookup succeeds.
            from torch._C import _cuda_getCurrentRawStream as get_raw_stream

            builtins.get_raw_stream = get_raw_stream


_patch_get_raw_stream_if_needed()

# Install the TorchInductor monkeypatches defined above, but only for the
# exact torch release they were written against.
if is_torch_equal("2.9.0"):
    from torch._inductor.codegen.wrapper import PythonWrapperCodegen
    from torch._inductor.graph import GraphLowering
    from torch.utils._config_module import _Config, _ConfigEntry

    # Register a custom_should_partition_ops config entry (empty by default)
    # that should_partition_patched consults.
    torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry(
        _Config(default=[])
    )

    PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
    GraphLowering._update_scheduler = _update_scheduler_patched
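
# Illustrative usage of the knob registered above (hypothetical op name; any
# "namespace::opname" string for a registered custom op would work). Setting
# it makes should_partition_patched split the graph around that op:
#
#     import vllm.env_override  # noqa: F401  -- must precede CUDA init
#     import torch
#
#     torch._inductor.config.custom_should_partition_ops = ["mylib::attention"]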