import importlib.util
import os


def _get_torch_cuda_version():
    """Peripheral function to _maybe_set_cuda_compatibility_path().
    PyTorch version must not be determined by importing directly
    because it will trigger the CUDA initialization, losing the
    chance to set the LD_LIBRARY_PATH beforehand.
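
    Example (illustrative value; the actual string depends on the installed
    torch wheel, and None is returned when it cannot be determined):

        >>> _get_torch_cuda_version()
        '12.8'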
    """
    try:
        spec = importlib.util.find_spec("torch")
        if not spec:
            return None
        # Locate the torch package directory without importing the package.
        if spec.origin:
            torch_root = os.path.dirname(spec.origin)
        elif spec.submodule_search_locations:
            torch_root = spec.submodule_search_locations[0]
        else:
            return None
        version_path = os.path.join(torch_root, "version.py")
        if not os.path.exists(version_path):
            return None
        # Execute only torch/version.py, which has no import side effects.
        ver_spec = importlib.util.spec_from_file_location(
            "torch.version", version_path
        )
        if not ver_spec or not ver_spec.loader:
            return None
        module = importlib.util.module_from_spec(ver_spec)
        ver_spec.loader.exec_module(module)
        return getattr(module, "cuda", None)
    except Exception:
        return None


def _maybe_set_cuda_compatibility_path():
    """Set LD_LIBRARY_PATH for CUDA forward compatibility if enabled.

    Must run before 'import torch' since torch loads CUDA shared libraries
    at import time and the dynamic linker only consults LD_LIBRARY_PATH when
    a library is first loaded.

    CUDA forward compatibility is only supported on select professional and
    datacenter NVIDIA GPUs. Consumer GPUs (GeForce, RTX) do not support it
    and will get Error 803 if compat libs are loaded.
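
    Example (hypothetical paths and command line; the compat directory comes
    from NVIDIA's driver compatibility package):

        export VLLM_ENABLE_CUDA_COMPATIBILITY=1
        export VLLM_CUDA_COMPATIBILITY_PATH=/usr/local/cuda-12.8/compat
        vllm serve <model>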
    """
    enable = os.environ.get(
        "VLLM_ENABLE_CUDA_COMPATIBILITY", "0"
    ).strip().lower() in ("1", "true")
    if not enable:
        return

    # An explicitly configured path wins.
    cuda_compat_path = os.environ.get("VLLM_CUDA_COMPATIBILITY_PATH", "")

    # Otherwise, try a conda-provided cuda-compat directory.
    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
        conda_prefix = os.environ.get("CONDA_PREFIX", "")
        conda_compat = os.path.join(conda_prefix, "cuda-compat")
        if conda_prefix and os.path.isdir(conda_compat):
            cuda_compat_path = conda_compat

    # Finally, fall back to the default location for the CUDA version that
    # torch was built against.
    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
        torch_cuda_version = _get_torch_cuda_version()
        if torch_cuda_version:
            default_path = f"/usr/local/cuda-{torch_cuda_version}/compat"
            if os.path.exists(default_path):
                cuda_compat_path = default_path

    if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
        return

    # Prepend the compat directory to LD_LIBRARY_PATH (deduplicated) so the
    # dynamic linker consults it when torch first loads the CUDA libraries.
    norm_path = os.path.normpath(cuda_compat_path)
    existing = os.environ.get("LD_LIBRARY_PATH", "")
    ld_paths = existing.split(os.pathsep) if existing else []
    if ld_paths and os.path.normpath(ld_paths[0]) == norm_path:
        return
    new_paths = [norm_path] + [
        p for p in ld_paths if p and os.path.normpath(p) != norm_path
    ]
    os.environ["LD_LIBRARY_PATH"] = os.pathsep.join(new_paths)


_maybe_set_cuda_compatibility_path()

import torch  # noqa: E402

from vllm.logger import init_logger  # noqa: E402
from vllm.utils.torch_utils import is_torch_equal  # noqa: E402

logger = init_logger(__name__)

# Use NVML for device availability checks instead of initializing a CUDA
# context.
os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"
# Keep TorchInductor compilation in-process (no compile-thread pool).
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
torch._inductor.config.compile_threads = 1


# The functions below monkeypatch TorchInductor for torch 2.9.x. They are
# installed at the bottom of this file, guarded by is_torch_equal("2.9.0").


def memory_plan_reuse_patched(self):
    import torch._inductor.ir as ir
    from torch._inductor.codegen.wrapper import (
        EnterSubgraphLine,
        ExitSubgraphLine,
        MemoryPlanningLine,
        MemoryPlanningState,
        SubgraphPythonWrapperCodegen,
    )
    from torch._inductor.virtualized import V

    def get_output_names(graph_outputs) -> list[str]:
        import itertools

        names = []
        shape_counter = itertools.count(0)
        none_counter = itertools.count(0)
        for node in graph_outputs:
            if isinstance(node, ir.NoneAsConstantBuffer):
                names.append(f"{V.graph.name}_none{next(none_counter)}")
            elif isinstance(node, ir.ShapeAsConstantBuffer):
                names.append(f"{V.graph.name}_shape{next(shape_counter)}")
            else:
                names.append(node.get_name())
        return names

    # For a subgraph wrapper that carries a partition signature, the real
    # outputs are the partition's output nodes, not the graph's.
    if (
        isinstance(V.graph.wrapper_code, SubgraphPythonWrapperCodegen)
        and V.graph.wrapper_code.partition_signatures is not None
    ):
        out_names = get_output_names(
            V.graph.wrapper_code.partition_signatures.output_nodes
        )
    else:
        out_names = V.graph.get_output_names()

    # Trailing memory-planning lines for non-outputs are pointless; drop them.
    while (
        self.lines
        and isinstance(self.lines[-1], MemoryPlanningLine)
        and self.lines[-1].node.name not in out_names
    ):
        self.lines.pop()

    # Plan allocations, tracking one planning state per (sub)graph.
    planning_states = [MemoryPlanningState()]
    past_planning_states = []
    for i in range(len(self.lines)):
        line = self.lines[i]
        if isinstance(line, MemoryPlanningLine):
            self.lines[i] = line.plan(planning_states[-1])
        elif isinstance(line, EnterSubgraphLine):
            planning_states.append(MemoryPlanningState())
        elif isinstance(line, ExitSubgraphLine):
            past_planning_states.append(planning_states.pop())
    past_planning_states.append(planning_states.pop())
    assert len(planning_states) == 0


def get_graph_partition_signature_patched(
    self, partitions, skip_cudagraphs: list[bool]
):
    """
    Gets the signature for each graph partition: its input nodes, its output
    nodes, and whether each input is deallocated within the partition.
    """
    from torch._inductor import dependencies
    from torch._inductor.ir import (
        GraphPartitionSignature,
        MutationOutput,
        NoneLayout,
    )
    from torch._inductor.virtualized import V
    from torch.utils._ordered_set import OrderedSet

    signatures = []

    unmet_output_names = OrderedSet(V.graph.get_output_names())
    name_to_node = self.get_name_to_nodes()

    def is_none_layout(buf_name: str) -> bool:
        """
        Checks whether buf_name refers to a NoneLayout buffer. Such buffers
        are never allocated, so a graph partition should not take them as
        inputs or outputs.
        """
        buf = self.name_to_buf.get(buf_name, None)
        if buf is None:
            return False
        if isinstance(buf.node.layout, NoneLayout):
            if isinstance(buf.node, MutationOutput) and (
                real_name := self.mutation_real_name.get(buf_name, None)
            ):
                return is_none_layout(real_name)
            return True
        return False

    for partition, skip_cudagraph in zip(
        reversed(partitions), reversed(skip_cudagraphs)
    ):
        output_names: OrderedSet[str] = OrderedSet()
        for node in partition:
            output_names.update(node.outputs_by_name.keys())

        returned_output_names = output_names.intersection(unmet_output_names)

        # All reads/writes are partition inputs, except buffers generated
        # within the partition and unallocated (NoneLayout) buffers.
        read_writes = dependencies.ReadWrites.merge_list(
            [node.read_writes for node in partition]
        )
        partition_input_names = (
            OrderedSet(
                x.name
                for x in read_writes.reads | read_writes.writes
                if not is_none_layout(x.name)
            )
            - output_names
        )
        partition_input_names = OrderedSet(
            self.mutation_real_name.get(name, name)
            for name in partition_input_names
        )

        buffer_names_to_free: OrderedSet[str] = OrderedSet()
        for node in partition:
            buffer_names_to_free.update(node.last_usage)

        # A buffer freed here but allocated by an earlier partition must also
        # become an input of this partition.
        extra_input_names = [
            name
            for name in (buffer_names_to_free - output_names)
            if name in name_to_node
        ]
        partition_input_names.update(extra_input_names)

        input_nodes = {
            name: name_to_node[name]
            for name in partition_input_names
            if name in name_to_node
        }
        input_deallocation = {
            name: name in buffer_names_to_free
            for name in partition_input_names
            if name in name_to_node
        }

        # Inputs that are not freed inside the partition are also returned as
        # outputs, so cudagraph can keep them at static addresses.
        extra_output_names = [
            name
            for name in partition_input_names
            if name in name_to_node and name not in buffer_names_to_free
        ]
        returned_output_names.update(extra_output_names)

        returned_output_names = OrderedSet(
            self.mutation_real_name.get(name, name)
            for name in returned_output_names
        )
        returned_output_names = [
            name for name in returned_output_names if not is_none_layout(name)
        ]
        output_nodes = [name_to_node[name] for name in returned_output_names]

        constant_names = [
            name for name in partition_input_names if name in V.graph.constants
        ]

        symbol_inputs = self.get_graph_partition_symbol_inputs(
            partition, input_nodes
        )

        partition_signature = GraphPartitionSignature(
            symbol_inputs,
            input_nodes,
            output_nodes,
            input_deallocation,
            skip_cudagraph,
            constant_names,
        )
        signatures.append(partition_signature)

        # Inputs of this partition are outputs that some earlier partition
        # still has to produce.
        unmet_output_names = partition_input_names.union(
            unmet_output_names - output_names
        )

    return signatures[::-1]


def should_partition_patched(self, node, should_log: bool = False) -> bool:
    """Return True if we should partition the inductor graph on this node"""
    import torch._inductor.ir as ir
    from torch._inductor.scheduler import BaseSchedulerNode, FusedSchedulerNode
    from torch._inductor.utils import (
        _unstable_customized_partition_wrapper,
        is_cudagraph_unsafe_op,
        maybe_log_cudagraph_partition,
    )

    # Users can force a partition boundary for specific fallback ops via the
    # custom_should_partition_ops config entry registered below.
    ir_node = node.node
    if isinstance(ir_node, ir.FallbackKernel) and ir_node.op_overload is not None:
        op = ir_node.op_overload
        op_overload_packet_name = op.name()
        op_overload_name = (
            f"{op_overload_packet_name}.{op._overloadname}"
            if isinstance(op, torch._ops.OpOverload)
            else op_overload_packet_name
        )
        if (
            op_overload_packet_name
            in torch._inductor.config.custom_should_partition_ops
            or op_overload_name
            in torch._inductor.config.custom_should_partition_ops
        ):
            assert isinstance(op, torch._ops.OpOverload)
            return True

    # Without cudagraphs (and without a customized partition wrapper), graph
    # partitioning brings no benefit: keep everything in one function.
    if (
        not torch._inductor.config.triton.cudagraphs
        and _unstable_customized_partition_wrapper.wrapper is None
    ):
        return True

    def noop_log(msg: str, node: BaseSchedulerNode | None) -> None:
        return None

    log_partition_reason = maybe_log_cudagraph_partition if should_log else noop_log

    if isinstance(node, FusedSchedulerNode):
        return any(self.should_partition(snode) for snode in node.snodes)

    assert node.node is not None

    if not node.is_gpu():
        log_partition_reason("non gpu ops", node=node)
        return True

    if isinstance(node.node, ir.DeviceCopy):
        log_partition_reason("DeviceCopy ops", node=node)
        return True

    if isinstance(node.node, ir.Conditional):
        log_partition_reason("Conditional ops", node=node)
        return True

    if getattr(node.node, "unbacked_bindings", None):
        log_partition_reason("unbacked binding ops", node=node)
        return True

    if is_cudagraph_unsafe_op(node.node):
        log_partition_reason("CUDAGraph-unsafe custom ops", node=node)
        return True

    return False


def _update_scheduler_patched(self) -> None:
    """
    (Re)initializes the scheduler member.  When initializing the scheduler, no CUBIN
    files should be generated (to avoid biasing any benchmarks and pessimizing
    fusion decisions).
    """
    import torch._inductor.config as config
    from torch._inductor.scheduler import Scheduler

    # Swap in the patched partition logic before the Scheduler is built.
    Scheduler.should_partition = should_partition_patched
    Scheduler.get_graph_partition_signature = get_graph_partition_signature_patched

    with config.patch("triton.store_cubin", False):
        self.scheduler = Scheduler(self.operations)


def _patch_get_raw_stream_if_needed():
    """Workaround for TorchInductor autotune get_raw_stream() bug."""
    from vllm.utils.torch_utils import is_torch_equal

    if is_torch_equal("2.9.0") or is_torch_equal("2.9.1"):
        import builtins

        if hasattr(torch._C, "_cuda_getCurrentRawStream"):
            # Some autotuning code paths look up get_raw_stream as a builtin;
            # make sure that lookup succeeds.
            from torch._C import _cuda_getCurrentRawStream as get_raw_stream

            builtins.get_raw_stream = get_raw_stream


_patch_get_raw_stream_if_needed()

# Install the TorchInductor monkeypatches defined above, but only for the
# exact torch release they were written against.
if is_torch_equal("2.9.0"):
    from torch._inductor.codegen.wrapper import PythonWrapperCodegen
    from torch._inductor.graph import GraphLowering
    from torch.utils._config_module import _Config, _ConfigEntry

    # Register a custom_should_partition_ops config entry (empty by default)
    # that should_partition_patched consults.
    torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry(
        _Config(default=[])
    )

    PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
    GraphLowering._update_scheduler = _update_scheduler_patched
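
# Illustrative usage of the knob registered above (hypothetical op name; any
# "namespace::opname" string for a registered custom op would work). Setting
# it makes should_partition_patched split the graph around that op:
#
#     import vllm.env_override  # noqa: F401  -- must precede CUDA init
#     import torch
#
#     torch._inductor.config.custom_should_partition_ops = ["mylib::attention"]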