"""vLLM platform implementation for Intel XPU devices."""

import contextlib
import os
from typing import TYPE_CHECKING

import torch

# The vllm_xpu_kernels extension modules are imported for their side effects
# (registering the compiled XPU kernels with torch).
import vllm_xpu_kernels._C  # noqa: F401
import vllm_xpu_kernels._moe_C  # noqa: F401
import vllm_xpu_kernels._xpu_C  # noqa: F401

from vllm.logger import init_logger
# NOTE: the source module of supports_xpu_graph is not preserved verbatim in
# the compiled dump; vllm.utils.torch_utils is assumed here.
from vllm.utils.torch_utils import supports_xpu_graph
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from .interface import DeviceCapability, Platform, PlatformEnum

if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.v1.attention.selector import AttentionSelectorConfig
else:
    VllmConfig = None

logger = init_logger(__name__)


class XPUPlatform(Platform):
    _enum = PlatformEnum.XPU
    device_name: str = "xpu"
    device_type: str = "xpu"
    dispatch_key: str = "XPU"
    ray_device_key: str = "GPU"
    dist_backend: str = "xccl"
    device_control_env_var: str = "ZE_AFFINITY_MASK"

    @classmethod
    def import_kernels(cls) -> None:
        with contextlib.suppress(ImportError):
            import vllm._moe_C  # noqa: F401

    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: "AttentionBackendEnum",
        attn_selector_config: "AttentionSelectorConfig",
        num_heads: int | None = None,
    ) -> str:
        from vllm.v1.attention.backends.utils import set_kv_cache_layout

        set_kv_cache_layout("NHD")
        logger.info(
            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; only NHD layout "
            "is supported by XPU attention kernels."
        )
        dtype = attn_selector_config.dtype
        if attn_selector_config.use_sparse:
            logger.info_once("Using XPU MLA Sparse backend.")
            return AttentionBackendEnum.XPU_MLA_SPARSE.get_path()
        if attn_selector_config.use_mla:
            logger.info_once("Using Triton MLA backend on V1 engine.")
            return AttentionBackendEnum.TRITON_MLA.get_path()
        if selected_backend == AttentionBackendEnum.TRITON_ATTN:
            logger.info_once("Using Triton backend.")
            return AttentionBackendEnum.TRITON_ATTN.get_path()
        if dtype == torch.float32:
            logger.warning_once(
                "Flash Attention on XPU does not support float32 dtype. "
                "Falling back to Triton Attention backend."
            )
            return AttentionBackendEnum.TRITON_ATTN.get_path()
        if selected_backend == AttentionBackendEnum.FLASH_ATTN:
            logger.info_once("Using Flash Attention backend.")
            return AttentionBackendEnum.FLASH_ATTN.get_path()
        if selected_backend:
            raise ValueError(
                f"Invalid attention backend for {cls.device_name}, "
                f"with use_mla: {attn_selector_config.use_mla}"
            )
        logger.info_once("Using Flash Attention backend.")
        return AttentionBackendEnum.FLASH_ATTN.get_path()

    @classmethod
    def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]:
        # NOTE: only TORCH_SDPA survives verbatim in the compiled dump; the
        # first two members are reconstructed.
        return [
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.TRITON_ATTN,
            AttentionBackendEnum.TORCH_SDPA,
        ]

    @classmethod
    def get_vit_attn_backend(
        cls,
        head_size: int,
        dtype: torch.dtype,
        backend: "AttentionBackendEnum | None" = None,
    ) -> "AttentionBackendEnum":
        if backend is not None:
            assert backend in cls.get_supported_vit_attn_backends(), (
                f"Backend {backend} is not supported for vit attention. "
                f"Supported backends are: {cls.get_supported_vit_attn_backends()}."
            )
            logger.info_once(f"Using backend {backend} for vit attention")
            return backend
        # Default ViT attention backend (reconstructed as FLASH_ATTN).
        logger.info_once(
            f"Using backend {AttentionBackendEnum.FLASH_ATTN} for vit attention"
        )
        return AttentionBackendEnum.FLASH_ATTN

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """
        Set the device for the current platform.
        """
        torch.xpu.set_device(device)

    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
        # Compute capability is not reported for XPU devices.
        return None

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.xpu.get_device_name(device_id)

    @classmethod
    def get_punica_wrapper(cls) -> str:
        xpu_use_triton_kernel = os.getenv("XPU_USE_TRITON_KERNEL", "0") == "1"
        if not xpu_use_triton_kernel:
            return "vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPU"
        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        device_props = torch.xpu.get_device_properties(device_id)
        return device_props.total_memory

    @classmethod
    def inference_mode(cls):
        return torch.no_grad()

    @classmethod
    def get_static_graph_wrapper_cls(cls) -> str:
        return "vllm.compilation.cuda_graph.CUDAGraphWrapper"

    @classmethod
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        cache_config = vllm_config.cache_config
        model_config = vllm_config.model_config
        parallel_config = vllm_config.parallel_config

        if cache_config and not cache_config.user_specified_block_size:
            cache_config.block_size = 64

        from vllm.config import CUDAGraphMode

        compilation_config = vllm_config.compilation_config
        if compilation_config.compile_sizes is None:
            compilation_config.compile_sizes = []

        attention_config = vllm_config.attention_config
        if attention_config.backend is None:
            attention_config.backend = AttentionBackendEnum.FLASH_ATTN

        if not supports_xpu_graph():
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            logger.warning(
                "XPU Graph is not supported in the current PyTorch version, "
                "disabling cudagraph_mode."
            )
        elif parallel_config.world_size_across_dp > 1:
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            logger.warning(
                "XPU Graph doesn't support capture communication ops, "
                "disabling cudagraph_mode."
            )
        elif attention_config.backend == AttentionBackendEnum.FLASH_ATTN and (
            compilation_config.cudagraph_mode
            not in (CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE)
        ):
            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
            logger.warning(
                "FMHA sycl-tla kernels cannot be captured with XPU graphs, "
                "falling back to PIECEWISE graph mode on XPU platform."
            )

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"

        if vllm_config.kv_transfer_config is not None:
            vllm_config.kv_transfer_config.enable_permute_local_kv = True

        if model_config and model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked prefill "
                "and prefix caching to be disabled."
            )
            vllm_config.scheduler_config.enable_chunked_prefill = False
            vllm_config.scheduler_config.max_num_batched_tokens = max(
                vllm_config.scheduler_config.max_model_len,
                vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )

        os.environ["UCX_MEMTYPE_CACHE"] = "n"

    @classmethod
    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
        # No backend-specific block-size adjustment is needed on XPU.
        return None

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        return True

    @classmethod
    def support_static_graph_mode(cls) -> bool:
        return True

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        return True

    @classmethod
    def get_current_memory_usage(
        cls, device: torch.types.Device | None = None
    ) -> float:
        torch.xpu.reset_peak_memory_stats(device)
        return torch.xpu.max_memory_allocated(device)

    @classmethod
    def fp8_dtype(cls) -> torch.dtype:
        return torch.float8_e4m3fn

    @classmethod
    def is_data_center_gpu(cls) -> bool:
        device_name = cls.get_device_name().lower()
        return device_name.count("data center gpu") > 0

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        from vllm.utils.torch_utils import supports_xccl

        if not supports_xccl():
            logger.warning(
                "xccl is not enabled in this torch build, "
                "communication is not available."
            )
        return (
            "vllm.distributed.device_communicators.xpu_communicator."
            "XpuCommunicator"
        )

    @classmethod
    def device_count(cls) -> int:
        return torch.xpu.device_count()

    @classmethod
    def check_if_supports_dtype(cls, dtype: torch.dtype):
        if dtype == torch.bfloat16:
            device_name = cls.get_device_name().lower()
            if device_name.count("a770") > 0:
                raise ValueError(
                    "Intel Arc A770 have bfloat16 accuracy known issue. "
                    "You can use float16 instead by explicitly setting the "
                    "`dtype` flag in CLI, for example: --dtype=half."
                )

    @classmethod
    def opaque_attention_op(cls) -> bool:
        return True

    @classmethod
    def insert_blocks_to_device(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from src_cache to dst_cache on XPU."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)

    @classmethod
    def swap_out_blocks_to_host(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from XPU to host (CPU)."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.cpu()

    @classmethod
    def num_compute_units(cls, device_id: int = 0) -> int:
        return torch.xpu.get_device_properties(device_id).max_compute_units
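

# Illustrative usage sketch, not part of the original module: assuming an
# Intel XPU build of vLLM where `vllm.platforms.current_platform` resolves to
# this XPUPlatform, the platform hooks defined above can be queried like this.
if __name__ == "__main__":  # demo only
    from vllm.platforms import current_platform

    if current_platform.device_type == "xpu":
        # Basic device discovery through the classmethods defined above.
        print("device name:", current_platform.get_device_name(0))
        print("device count:", current_platform.device_count())
        print("total memory (bytes):", current_platform.get_device_total_memory(0))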