"""Code inside this file can safely assume cuda platform, e.g. importing
pynvml. However, it should not initialize cuda context.
"""

import os
from collections.abc import Callable
from datetime import timedelta
from functools import cache, wraps
from typing import TYPE_CHECKING, TypeVar

import torch
from torch.distributed import PrefixStore, ProcessGroup
from torch.distributed.distributed_c10d import is_nccl_available
from typing_extensions import ParamSpec

import vllm._C  # noqa
from vllm.logger import init_logger
from vllm.utils.import_utils import import_pynvml
from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from .interface import DeviceCapability, Platform, PlatformEnum

if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.config.cache import CacheDType
    from vllm.v1.attention.selector import AttentionSelectorConfig
else:
    VllmConfig = None
    CacheDType = None

logger = init_logger(__name__)

_P = ParamSpec("_P")
_R = TypeVar("_R")

pynvml = import_pynvml()

# cuDNN SDPA is disabled at import time; it can crash on some models.
torch.backends.cuda.enable_cudnn_sdp(False)


@cache
def _get_backend_priorities(
    use_mla: bool,
    device_capability: DeviceCapability,
    num_heads: int | None = None,
) -> list[AttentionBackendEnum]:
    """Get backend priorities with lazy import to avoid circular dependency."""
    if use_mla:
        if device_capability.major == 10:
            # NOTE: the head-count threshold and the relative order of the
            # two sparse MLA backends are reconstructed from compiled
            # constants and may not match the original source exactly.
            if num_heads is not None and num_heads == 64:
                sparse_backends = [
                    AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
                    AttentionBackendEnum.FLASHMLA_SPARSE,
                ]
            else:
                sparse_backends = [
                    AttentionBackendEnum.FLASHMLA_SPARSE,
                    AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
                ]
            return sparse_backends + [
                AttentionBackendEnum.FLASHINFER_MLA,
                AttentionBackendEnum.CUTLASS_MLA,
                AttentionBackendEnum.FLASH_ATTN_MLA,
                AttentionBackendEnum.FLASHMLA,
                AttentionBackendEnum.TRITON_MLA,
            ]
        # NOTE: ordering reconstructed from the bytecode; may differ slightly
        # from the original source.
        return [
            AttentionBackendEnum.FLASHMLA_SPARSE,
            AttentionBackendEnum.FLASH_ATTN_MLA,
            AttentionBackendEnum.FLASHMLA,
            AttentionBackendEnum.FLASHINFER_MLA,
            AttentionBackendEnum.TRITON_MLA,
        ]
    if device_capability.major == 10:
        return [
            AttentionBackendEnum.FLASHINFER,
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.TRITON_ATTN,
            AttentionBackendEnum.FLEX_ATTENTION,
        ]
    return [
        AttentionBackendEnum.FLASH_ATTN,
        AttentionBackendEnum.FLASHINFER,
        AttentionBackendEnum.TRITON_ATTN,
        AttentionBackendEnum.FLEX_ATTENTION,
    ]


def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
    """Run the wrapped function inside an initialized NVML context."""

    @wraps(fn)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
        pynvml.nvmlInit()
        try:
            return fn(*args, **kwargs)
        finally:
            pynvml.nvmlShutdown()

    return wrapper


class CudaPlatformBase(Platform):
    _enum = PlatformEnum.CUDA
    device_name: str = "cuda"
    device_type: str = "cuda"
    dispatch_key: str = "CUDA"
    ray_device_key: str = "GPU"
    dist_backend: str = "nccl"
    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
    ray_noset_device_env_vars: list[str] = [
        "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
    ]

    @property
    def supported_dtypes(self) -> list[torch.dtype]:
        if self.has_device_capability(80):
            # Ampere and later NVIDIA GPUs.
            return [torch.bfloat16, torch.float16, torch.float32]
        if self.has_device_capability(60):
            # Pascal, Volta and Turing GPUs: bfloat16 is not supported.
            return [torch.float16, torch.float32]
        return [torch.float32]

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """
        Set the device for the current platform.
        """
        torch.cuda.set_device(device)
        # Force eager initialization of the device.
        _ = torch.zeros(1, device=device)

    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
        raise NotImplementedError

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        raise NotImplementedError

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        raise NotImplementedError

    @classmethod
    def is_fully_connected(cls, device_ids: list[int]) -> bool:
        raise NotImplementedError

    @classmethod
    def log_warnings(cls):
        pass

    @classmethod
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        parallel_config = vllm_config.parallel_config
        model_config = vllm_config.model_config

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"

        scheduler_config = vllm_config.scheduler_config
        if (
            model_config is not None
            and model_config.is_mm_prefix_lm
            and model_config.is_multimodal_model
            and not scheduler_config.disable_chunked_mm_input
        ):
            logger.warning(
                "Forcing --disable_chunked_mm_input for models with "
                "multimodal-bidirectional attention."
            )
            scheduler_config.disable_chunked_mm_input = True

    @classmethod
    def get_current_memory_usage(
        cls, device: torch.types.Device | None = None
    ) -> float:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats(device)
        return torch.cuda.max_memory_allocated(device)

    @classmethod
    def get_valid_backends(
        cls,
        device_capability: DeviceCapability,
        attn_selector_config: "AttentionSelectorConfig",
        num_heads: int | None = None,
    ) -> tuple[
        list[tuple[AttentionBackendEnum, int]],
        dict[AttentionBackendEnum, tuple[int, list[str]]],
    ]:
        valid_backends_priorities = []
        invalid_reasons = {}
        backend_priorities = _get_backend_priorities(
            attn_selector_config.use_mla, device_capability, num_heads
        )
        for priority, backend in enumerate(backend_priorities):
            try:
                backend_class = backend.get_class()
                invalid_reasons_i = backend_class.validate_configuration(
                    device_capability=device_capability,
                    **attn_selector_config._asdict(),
                )
            except ImportError:
                invalid_reasons_i = ["ImportError"]
            if invalid_reasons_i:
                invalid_reasons[backend] = (priority, invalid_reasons_i)
            else:
                valid_backends_priorities.append((backend, priority))

        return valid_backends_priorities, invalid_reasons

    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: "AttentionBackendEnum | None",
        attn_selector_config: "AttentionSelectorConfig",
        num_heads: int | None = None,
    ) -> str:
        device_capability = cls.get_device_capability()
        assert device_capability is not None

        # First validate just the selected backend, if there is one.
        if selected_backend is not None:
            try:
                backend_class = selected_backend.get_class()
                invalid_reasons = backend_class.validate_configuration(
                    device_capability=device_capability,
                    **attn_selector_config._asdict(),
                )
            except ImportError:
                invalid_reasons = ["ImportError"]
            if invalid_reasons:
                raise ValueError(
                    f"Selected backend {selected_backend} is not valid for "
                    f"this configuration. Reason: {invalid_reasons}"
                )
            logger.info("Using %s backend.", selected_backend.name)
            return selected_backend.get_path()

        # No backend selected: pick the highest-priority valid backend.
        valid_backends_priorities, all_invalid_reasons = cls.get_valid_backends(
            device_capability=device_capability,
            attn_selector_config=attn_selector_config,
            num_heads=num_heads,
        )

        reasons_str = (
            "{"
            + ", ".join(
                f"{backend.name}: [{', '.join(reasons)}]"
                for backend, (_, reasons) in all_invalid_reasons.items()
            )
            + "}"
        )
        config_str = attn_selector_config.__repr__()
        logger.debug_once(
            f"Some attention backends are not valid for {cls.device_name} with "
            f"{config_str}. Reasons: {reasons_str}."
        )
        if len(valid_backends_priorities) == 0:
            raise ValueError(
                f"No valid attention backend found for {cls.device_name} with "
                f"{config_str}. Reasons: {reasons_str}"
            )

        sorted_indices = sorted(
            range(len(valid_backends_priorities)),
            key=lambda i: valid_backends_priorities[i][1],
        )
        selected_index = sorted_indices[0]
        selected_backend = valid_backends_priorities[selected_index][0]
        selected_priority = valid_backends_priorities[selected_index][1]

        if attn_selector_config.block_size is not None:
            excluded = [
                backend
                for backend, (priority, reasons) in all_invalid_reasons.items()
                if priority < selected_priority
                and reasons == ["block_size not supported"]
            ]
            if excluded:
                names = ", ".join(b.name for b in excluded)
                logger.warning(
                    "--block-size %d precluded higher-priority backend(s) %s. "
                    "Using %s instead, which may result in reduced performance. "
                    "Consider removing --block-size to auto-select the optimal "
                    "block size.",
                    attn_selector_config.block_size,
                    names,
                    selected_backend.name,
                )

        logger.info_once(
            "Using %s attention backend out of potential backends: %s",
            selected_backend.name,
            "[" + ", ".join(f"'{b[0].name}'" for b in valid_backends_priorities) + "]",
            scope="local",
        )
        return selected_backend.get_path()

    @classmethod
    def get_supported_vit_attn_backends(cls) -> list[AttentionBackendEnum]:
        # NOTE: ordering reconstructed from the bytecode; may differ slightly
        # from the original source.
        if cls.has_device_capability(80):
            return [
                AttentionBackendEnum.FLASH_ATTN,
                AttentionBackendEnum.FLASHINFER,
                AttentionBackendEnum.TORCH_SDPA,
                AttentionBackendEnum.TRITON_ATTN,
            ]
        return [
            AttentionBackendEnum.TORCH_SDPA,
            AttentionBackendEnum.FLASHINFER,
            AttentionBackendEnum.TRITON_ATTN,
            AttentionBackendEnum.FLEX_ATTENTION,
        ]

    @classmethod
    def get_vit_attn_backend(
        cls,
        head_size: int,
        dtype: torch.dtype,
        backend: "AttentionBackendEnum | None" = None,
    ) -> AttentionBackendEnum:
        if backend is not None:
            assert backend in cls.get_supported_vit_attn_backends(), (
                f"Backend {backend} is not supported for vit attention. "
                f"Supported backends are: {cls.get_supported_vit_attn_backends()}"
            )
            logger.info(f"Using backend {backend} for vit attention")
            return backend

        cc = cls.get_device_capability()
        for vit_attn_backend in cls.get_supported_vit_attn_backends():
            if vit_attn_backend == AttentionBackendEnum.TORCH_SDPA:
                return vit_attn_backend
            try:
                backend_class = vit_attn_backend.get_class()
                is_backend_supported = backend_class.supports_head_size(
                    head_size
                ) and backend_class.supports_dtype(dtype)
                if cc is not None:
                    is_backend_supported = (
                        is_backend_supported
                        and backend_class.supports_compute_capability(cc)
                    )
                if is_backend_supported:
                    logger.info(
                        f"Using backend {vit_attn_backend} for vit attention"
                    )
                    return vit_attn_backend
            except ImportError:
                continue
        return AttentionBackendEnum.TORCH_SDPA

    @classmethod
    def get_punica_wrapper(cls) -> str:
        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator"

    @classmethod
    def supports_fp8(cls) -> bool:
        return cls.has_device_capability(89)

    @classmethod
    def use_custom_allreduce(cls) -> bool:
        return True

    @classmethod
    def opaque_attention_op(cls) -> bool:
        return True

    @classmethod
    def get_static_graph_wrapper_cls(cls) -> str:
        return "vllm.compilation.cuda_graph.CUDAGraphWrapper"

    @classmethod
    def stateless_init_device_torch_dist_pg(
        cls,
        backend: str,
        prefix_store: PrefixStore,
        group_rank: int,
        group_size: int,
        timeout: timedelta,
    ) -> ProcessGroup:
        assert is_nccl_available()
        pg = ProcessGroup(prefix_store, group_rank, group_size)

        from torch.distributed.distributed_c10d import ProcessGroupNCCL

        backend_options = ProcessGroupNCCL.Options()
        backend_options._timeout = timeout

        backend_class = ProcessGroupNCCL(
            prefix_store, group_rank, group_size, backend_options
        )
        backend_type = ProcessGroup.BackendType.NCCL
        device = torch.device("cuda")
        pg._set_default_backend(backend_type)
        backend_class._set_sequence_number_for_group()
        pg._register_backend(device, backend_type, backend_class)
        return pg

    @classmethod
    def device_count(cls) -> int:
        return cuda_device_count_stateless()

    @classmethod
    def check_if_supports_dtype(cls, dtype: torch.dtype):
        if dtype == torch.bfloat16 and not cls.has_device_capability(80):
            capability = cls.get_device_capability()
            gpu_name = cls.get_device_name()

            if capability is None:
                compute_str = "does not have a compute capability"
            else:
                version_str = capability.as_version_str()
                compute_str = f"has compute capability {version_str}"

            raise ValueError(
                "Bfloat16 is only supported on GPUs "
                "with compute capability of at least 8.0. "
                f"Your {gpu_name} GPU {compute_str}. "
                "You can use float16 instead by explicitly setting the "
                "`dtype` flag in CLI, for example: --dtype=half."
            )

    @classmethod
    def insert_blocks_to_device(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from src_cache to dst_cache on GPU."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)

    @classmethod
    def swap_out_blocks_to_host(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from GPU to host (CPU)."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.cpu()

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        return True

    @classmethod
    def support_static_graph_mode(cls) -> bool:
        return True

    @classmethod
    def num_compute_units(cls, device_id: int = 0) -> int:
        return torch.cuda.get_device_properties(device_id).multi_processor_count

    @classmethod
    def use_custom_op_collectives(cls) -> bool:
        return True


class NvmlCudaPlatform(CudaPlatformBase):
    @classmethod
    @cache
    @with_nvml_context
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
        try:
            physical_device_id = cls.device_id_to_physical_device_id(device_id)
            handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
            major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
            return DeviceCapability(major=major, minor=minor)
        except RuntimeError:
            return None

    @classmethod
    @with_nvml_context
    def has_device_capability(
        cls,
        capability: tuple[int, int] | int,
        device_id: int = 0,
    ) -> bool:
        try:
            return super().has_device_capability(capability, device_id)
        except RuntimeError:
            return False

    @classmethod
    @with_nvml_context
    def get_device_name(cls, device_id: int = 0) -> str:
        physical_device_id = cls.device_id_to_physical_device_id(device_id)
        return cls._get_physical_device_name(physical_device_id)

    @classmethod
    @with_nvml_context
    def get_device_uuid(cls, device_id: int = 0) -> str:
        physical_device_id = cls.device_id_to_physical_device_id(device_id)
        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
        return pynvml.nvmlDeviceGetUUID(handle)

    @classmethod
    @with_nvml_context
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        physical_device_id = cls.device_id_to_physical_device_id(device_id)
        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
        return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)

    @classmethod
    @with_nvml_context
    def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
        """
        query if the set of gpus are fully connected by nvlink (1 hop)
        """
        handles = [
            pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids
        ]
        for i, handle in enumerate(handles):
            for j, peer_handle in enumerate(handles):
                if i < j:
                    try:
                        p2p_status = pynvml.nvmlDeviceGetP2PStatus(
                            handle,
                            peer_handle,
                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
                        )
                        if p2p_status != pynvml.NVML_P2P_STATUS_OK:
                            return False
                    except pynvml.NVMLError:
                        logger.exception(
                            "NVLink detection failed. This is normal if"
                            " your machine has no NVLink equipped."
                        )
                        return False
        return True

    @classmethod
    def _get_physical_device_name(cls, device_id: int = 0) -> str:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        return pynvml.nvmlDeviceGetName(handle)

    @classmethod
    @with_nvml_context
    def log_warnings(cls):
        device_ids: int = pynvml.nvmlDeviceGetCount()
        if device_ids > 1:
            device_names = [
                cls._get_physical_device_name(i) for i in range(device_ids)
            ]
            if (
                len(set(device_names)) > 1
                and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"
            ):
                logger.warning(
                    "Detected different devices in the system: %s. Please"
                    " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
                    "avoid unexpected behavior.",
                    ", ".join(device_names),
                )


class NonNvmlCudaPlatform(CudaPlatformBase):
    @classmethod
    @cache
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        device_props = torch.cuda.get_device_properties(device_id)
        return device_props.total_memory

    @classmethod
    def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
        logger.exception(
            "NVLink detection not possible, as context support was"
            " not found. Assuming no NVLink available."
        )
        return False


# Autodetect whether NVML is usable and pick the platform implementation.
nvml_available = False
try:
    try:
        pynvml.nvmlInit()
        nvml_available = True
    except Exception:
        # On Jetson, NVML is not supported.
        nvml_available = False
finally:
    if nvml_available:
        pynvml.nvmlShutdown()

CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform

CudaPlatform.log_warnings()