o
    :/iKb                     @   s  U d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ eeZejdd	Zejd
efddZejd
efddZde
de
d
efddZde d
e
dB fddZ!efde de dede
f fddZ"e"ddZ#e"ddZ$e"ddZ%e"ddZ&e"d d!Z'e"d"d#Z(e"d"d$Z)e"d"d%Z*e"d"d&Z+e"d'd(Z,e"d"d)Z-e"d*d+d,d- d.Z.d/Z/ee0d0< ejd
efd1d2Z1ejd
efd3d4Z2ejd
efd5d6Z3ejd
efd7d8Z4ejd
efd9d:Z5ejd
efd;d<Z6ejd
efd=d>Z7ejd
efd?d@Z8ejd
efdAdBZ9ejd
efdCdDZ:d
edB fdEdFZ;dGe<dHe<d
efdIdJZ=		/	/ddGe<dHe<dKe<dLe<dMe<dNe dOej>dPedQedB dRedSed
efdTdUZ?e rddVl@mAZA dWejBdXejBdYejBd
dfdZd[ZCdWejBdXejBdYejBd
dfd\d]ZDeAd^eCdWgeDd_ ejEjFd`g dadbdcejBddejBdeejBdfejBdgejBdhej>diedje d
ejBfdkdlZGejEHd`dcejBddejBdeejBdfejBdgejBdhej>diedje d
ejBfdmdnZIejEjFdog dadbdcejBddejBdeejBdfejBdhej>dje d
ejBfdpdqZJejEHdodcejBddejBdeejBdfejBdhej>dje d
ejBfdrdsZKejEjFdtg dadbduejBdvejBd
eLejBejBf fdwdxZMejEHdtduejBdvejBd
eLejBejBf fdydzZNejEjFd{g dadb	|ddcejBddejBdeejBdfejBd}ej>dje d
ejBfd~dZOejEHd{	|ddcejBddejBdeejBdfejBd}ej>dje d
ejBfddZP	|dduejBdejBdejBdejBd}ej>dje d
ejBfddZQduejBdejBdejBdejBdejBd}ej>dje d
ejBfddZR	dduejBdejBdejBdejBd}ej>dejBdB d
ejBfddZSduejBdvejBd
eLejBejBf fddZTe"ddZUejd
efddZVejd
efddZWdedej>dejBdejBfddZXg dZYdS )zoCompatibility wrapper for FlashInfer API changes.

Users of vLLM should always import **only** these wrappers.
    N)Callable)AnyNoReturn)init_logger)vllm_is_batch_invariant)current_platformFLASHINFER_CUBINS_REPOSITORYzWhttps://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/returnc                   C   s,   t jrdS tjddurdS td dS )z7Return `True` if flashinfer-cubin package is available.Tflashinfer_cubinNz&flashinfer-cubin package was not foundF)envsVLLM_HAS_FLASHINFER_CUBIN	importlibutil	find_speclogger
debug_once r   r   b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/utils/flashinfer.pyhas_flashinfer_cubin&   s   
r   c                   C   sD   t jddu rtd dS t s tddu r td dS dS )z8Return `True` if flashinfer-python package is available.
flashinferNz2FlashInfer unavailable since package was not foundFnvcczSFlashInfer unavailable since nvcc was not found and not using pre-downloaded cubinsT)r   r   r   r   r   r   shutilwhichr   r   r   r   has_flashinfer1   s   
r   ___c                  O   s   t d)z/Placeholder for unavailable FlashInfer backend.zFlashInfer backend is not available. Please install the package to enable FlashInfer kernels: https://github.com/flashinfer-ai/flashinfer)RuntimeError)r   r   r   r   r   _missingD   s   r   module_namec              	   C   s&   zt | W S  ttfy   Y dS w )zBSafely import a submodule and return it, or None if not available.N)r   import_moduleImportErrorModuleNotFoundError)r   r   r   r   _get_submoduleM   s
   r"   	attr_namefallback_fn.c                    s&   t jfdd  fdd}|S )z5Create a lazy import wrapper for a specific function.c                     s&   t  sd S t} | rt|  d S d S N)r   r"   getattr)mod)r#   r   r   r   	_get_impl[   s   z'_lazy_import_wrapper.<locals>._get_implc                     s*     }|d u r| i |S || i |S r%   r   )argskwargsimpl)r(   r$   r   r   wrapperb   s   z%_lazy_import_wrapper.<locals>.wrapper)	functoolscache)r   r#   r$   r,   r   )r(   r#   r$   r   r   _lazy_import_wrapperV   s   r/   flashinfer.fused_moetrtllm_bf16_moetrtllm_fp8_block_scale_moetrtllm_fp8_per_tensor_scale_moecutlass_fused_moe$flashinfer.cute_dsl.blockscaled_gemmgrouped_gemm_nt_maskedr   fp4_quantizenvfp4_batched_quantize*silu_and_mul_scaled_nvfp4_experts_quantizescaled_fp4_grouped_quantizezflashinfer.fp4_quantizationblock_scale_interleavetrtllm_fp4_block_scale_moezflashinfer.autotunerautotunec                  O   s   t  S r%   )
contextlibnullcontext)r)   r*   r   r   r   <lambda>   s    r@   )r$   F_is_fi_autotuningc                   C      t  o
tjdduS )z5Return `True` if FlashInfer comm module is available.flashinfer.commNr   r   r   r   r   r   r   r   has_flashinfer_comm   s   rE   c                  C   @   t  sdS g d} | D ]\}}t|}|rt||s dS qdS )z7Return `True` if FlashInfer mnnvl all2all is available.F))rC   Mapping)zflashinfer.comm.mnnvlMnnvlMemory)flashinfer.comm.trtllm_alltoallMnnvlMoe)rI   MoEAlltoallInfoT)rE   r"   hasattrrequired_functionsr   r#   r'   r   r   r   has_flashinfer_nvlink_two_sided      rO   c                   C   s   t  sdS tjdduS )zDReturn `True` if FlashInfer trtllm_moe_alltoall module is available.Fz#flashinfer.comm.trtllm_moe_alltoallN)rE   r   r   r   r   r   r   r   has_flashinfer_nvlink_one_sided   s   rQ   c                   C   rB   )z4Return `True` if FlashInfer MoE module is available.r0   NrD   r   r   r   r   has_flashinfer_moe   s   rR   c                   C   rB   )z:Return ``True`` if FlashInfer cutedsl module is available.zflashinfer.cute_dslNrD   r   r   r   r   has_flashinfer_cutedsl   s   rS   c                  C   rF   )z:Return `True` if FlashInfer TRTLLM fused MoE is available.F))r0   r2   )r0   r3   r0   r<   )r0   trtllm_mxint4_block_scale_moeTrR   r"   rL   rM   r   r   r   has_flashinfer_trtllm_fused_moe   s   rW   c                  C   rF   )z;Return `True` if FlashInfer CUTLASS fused MoE is available.F))r0   r4   )r   r7   )r   nvfp4_block_scale_interleaverT   TrV   rM   r   r   r    has_flashinfer_cutlass_fused_moe   rP   rY   c                  C   rF   )z=Return ``True`` if FlashInfer CUTLASS fused MoE is available.F))r5   r6   )r   r:   )r   &silu_and_scaled_nvfp4_experts_quantizeT)rS   r"   rL   rM   r   r   r   -has_flashinfer_cutedsl_grouped_gemm_nt_masked   s   r[   c               
   C   s~   t  rdS z tjtdd} | jdk}|rtd |W S td| j |W S  ty> } ztd| W Y d}~d	S d}~ww )
zReturn `True` if NVIDIA's artifactory is accessible.

    This checks connectivity to the kernel inference library artifactory
    which is required for downloading certain cubin kernels like TRTLLM FHMA.
    T   )timeout   z NVIDIA artifactory is accessiblez2NVIDIA artifactory returned failed status code: %dz+Failed to connect to NVIDIA artifactory: %sNF)	r   requestsgetr   status_coder   r   warning_once	Exception)response
accessibleer   r   r   has_nvidia_artifactory  s$   

rg   c                   C   s   t  rdS tdot S )z
    TRTLLM attention is supported if the platform is SM100,
    NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
    Fd   )r   r   is_device_capability_familyrg   r   r   r   r   supports_trtllm_attention  s   rj   c                  C   s   ddl m}  |  }|jjS )a,  
    This function should only be called during initialization stage when vllm config
    is set.
    Return `None` if --attention-config.use_trtllm_attention is not set,
    return `True` if TRTLLM attention is forced to be used,
    return `False` if TRTLLM attention is forced to be not used.
    r   )get_current_vllm_config)vllm.configrk   attention_configuse_trtllm_attention)rk   vllm_configr   r   r   force_use_trtllm_attention-  s   rp   num_qo_headsnum_kv_headsc                 C   s$   t  du rdS t }|o| | dkS )z=Check if the current configuration supports TRTLLM attention.Fr   )rp   rj   )rq   rr   
has_trtllmr   r   r   can_use_trtllm_attention;  s   
rt   
num_tokensmax_seq_lendcp_world_sizekv_cache_dtypeq_dtype
is_prefillforce_use_trtllm	has_sinkshas_specc                 C   s   |dur|sdS |dkrt d dS t s|rt d dS | | dkr.|r,t d dS |
r9|s9t d d	S |t krFt d
 d	S |	rOt d d	S |du rs|rb|dk}|r`t d |S |dkoi|dk}|rqt d |S t d d	S )z*Return `True` if TRTLLM attention is used.NF   zcTrtllm does not support returning LSE and as a result does not support DCP, reverting to FlashInferzkTRTLLM attention is not supported on this platform, but --attention-config.use_trtllm_attention is set to 1r   zTRTLLM attention is not supported for this combination of query and key heads, but --attention-config.use_trtllm_attention is set to 1z:Using TRTLLM attention (enabled for speculative decoding).Tz,Using TRTLLM attention (query is quantized).z6Using TRTLLM attention (required for attention sinks).autoz/Using TRTLLM prefill attention (auto-detected).   z.Using TRTLLM decode attention (auto-detected).zLUsing TRTLLM attention (--attention-config.use_trtllm_attention is set to 1))r   rb   rj   	info_oncer   	fp8_dtype)rq   rr   ru   rv   rw   rx   ry   rz   r{   r|   r}   
use_trtllmr   r   r   rn   C  sT   




rn   )direct_register_custom_opkk_nopek_pec                 C   s   ddl m} || || dS )ar  Custom op wrapper for flashinfer's concat_mla_k.

        This is an in-place operation that concatenates k_nope and k_pe into k.

        The kernel is optimized for DeepSeek V3 dimensions:
        - num_heads=128
        - nope_dim=128
        - rope_dim=64

        Key optimizations:
        - Warp-based processing with software pipelining
        - Vectorized memory access (int2 for nope, int for rope)
        - L2 prefetching for next row while processing current
        - Register reuse for rope values across all heads

        Args:
            k: Output tensor, shape [num_tokens, num_heads, nope_dim + rope_dim].
                Modified in-place.
            k_nope: The nope part of k, shape [num_tokens, num_heads, nope_dim].
            k_pe: The rope part of k (shared), shape [num_tokens, 1, rope_dim].
                  This is broadcast to all heads.
        r   )concat_mla_kN)flashinfer.concat_opsr   )r   r   r   r   r   r   r   _flashinfer_concat_mla_k  s   r   c                 C   s   d S r%   r   )r   r   r   r   r   r   _flashinfer_concat_mla_k_fake  s   r   flashinfer_concat_mla_k)op_nameop_funcmutates_args	fake_implzvllm::flashinfer_mm_fp4cuda)r   device_typesABA_scaleB_scaleg_scaledtypeuse_8x4_sf_layoutbackendc           	      C   s&   ddl m} || |||||d||d	S )Nr   )mm_fp4   )
block_sizer   r   )r   r   )	r   r   r   r   r   r   r   r   flashinfer_mm_fp4_r   r   r   flashinfer_mm_fp4  s   r   c                 C   s    t j| jd |jd || jdS Nr   r~   r   devicetorchemptyshaper   )r   r   r   r   r   r   r   r   r   r   r   flashinfer_mm_fp4_fake  s    r   zvllm::bmm_fp8c                 C   s    ddl m} || ||||d |S )Nr   )bmm_fp8)r   r   )r   r   r   r   r   r   bmm_fp8_r   r   r   r     s   r   c                 C   s(   t j| jd | jd |jd || jdS )Nr   r~      r   r   )r   r   r   r   r   r   r   r   r   bmm_fp8_fake  s   r   zvllm::flashinfer_nvfp4_quantizeaa_global_sfc                 C   s*   ddl m} ddl m} || ||jddS )Nr   )SfLayout)nvfp4_quantizeF)sfLayout
do_shuffle)r   r   r   
layout_8x4)r   r   r   nvfp4_quantize_r   r   r   flashinfer_nvfp4_quantize  s
   
r   c                 C   s^   | j \}}dd }||d}|d }||d}tj||d tj| jdtj||tj| jdfS )Nc                 S   s   | | d | | S )Nr~   r   )xyr   r   r   r@   +  s    z0flashinfer_nvfp4_quantize_fake.<locals>.<lambda>   r      r   r   )r   r   r   uint8r   )r   r   mnround_up	rounded_mscale_n	rounded_nr   r   r   flashinfer_nvfp4_quantize_fake#  s   


r   zvllm::mm_mxfp8cutlass	out_dtypec              	   C   s"   ddl m} || |||d ||dS )Nr   )mm_mxfp8)outr   r   )r   r   )r   r   r   r   r   r   	mm_mxfp8_r   r   r   r   5  s   r   c                 C   s    t j| jd |jd || jdS r   r   )r   r   r   r   r   r   r   r   r   mm_mxfp8_fakeN  s    r   bblock_scale_ablock_scale_bc                 C   sf   | j dkr
|j dksJ | jd |jd ksJ |j dkr'tdt|j t| | ||||dS )ai  MXFP8 MM helper - mirrors flashinfer_scaled_fp4_mm API.

    Takes non-transposed weights and handles transpose internally.

    CRITICAL: mm_mxfp8 CUTLASS kernel requires SWIZZLED 1D scales for optimal
    performance and accuracy. Both input and weight scales should be in
    swizzled format from FlashInfer's mxfp8_quantize(is_sf_swizzled_layout=True).
    r   r~   zBmm_mxfp8 expects 1D swizzled weight scales for CUTLASS; got shape=)r   )ndimr   
ValueErrortupler   t)r   r   r   r   r   r   r   r   r   flashinfer_mm_mxfp8]  s    
r   alphac              
   C   s   | j dkr
|j dksJ |j dkr|j dksJ | ddkr&|ddks(J | jd |jd ks4J |dv rD|tj}|tj}|dkrQ| jd dkrQdnd	}t| | || ||||d
S )Nr   r~   )r   cudnntrtllmr       TF)r   r   )r   strider   viewr   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   flashinfer_scaled_fp4_mm  s$   	 r   scale_ascale_bbiasc                 C   s  | j dkr
|j dksJ | jd |jd ksJ | dkr$| dks&J | jtjkr2|jtjks4J | jjdkr@|jjdksBJ |jtjkrN|jtjksPJ |jjdkr\|jjdks^J t	| 
d|
d|||d| jd |jd }|d ur|| }|S )Nr   r~   r   r   r   )r   r   numelr   r   float8_e4m3fnr   typefloat32r   	unsqueezer   )r   r   r   r   r   r   outputr   r   r   flashinfer_scaled_fp8_mm  s(   	r   c                 C   s
   t | |S r%   )r   )r   r   r   r   r   $flashinfer_quant_nvfp4_8x4_sf_layout  s   
r   flashinfer.gemmfp8_blockscale_gemm_sm90c                   C   s   t  otdottddS )z>Return `True` if FlashInfer block-scale FP8 GEMM is available.Z   r   r   )r   r   is_device_capabilityrL   r"   r   r   r   r   "has_flashinfer_fp8_blockscale_gemm  s
   r   c                   C   s   t jot S )z>Return `True` if FlashInfer block-scale FP8 GEMM is supported.)r   #VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFERr   r   r   r   r   +is_flashinfer_fp8_blockscale_gemm_supported  s   r   is_flashinfer_supportedoutput_dtypeinputweightc           	      C   sb   | sdS d}d}|j }|j }|tjko.|tjko.|tjko.|jd | dko.|jd | dk}|S )NF@      r   r~   )r   r   bfloat16r   r   )	r   r   r   r   
N_MULTIPLE
K_MULTIPLEweight_dtypeinput_dtypeshould_use_flashinferr   r   r   -should_use_flashinfer_for_blockscale_fp8_gemm  s    
r   )r   %flashinfer_trtllm_fp8_block_scale_moeflashinfer_cutlass_fused_moe)flashinfer_cutedsl_grouped_gemm_nt_maskedflashinfer_fp4_quantizer9   r:   rX   r<   r=   rR   rE   rO   rQ   rY   r[   r   rg   rj   rt   rn   r   r   r   flashinfer_fp8_blockscale_gemmr   r   )NFF)r   r%   )Z__doc__r>   r-   r   importlib.utilosr   collections.abcr   typingr   r   r_   r   	vllm.envsr   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platformsr   __name__r   environr`   r   r.   boolr   r   r   strr"   r/   flashinfer_trtllm_bf16_moer   *flashinfer_trtllm_fp8_per_tensor_scale_moer   r   r   r8   r9   r:   rX   r<   r=   rA   __annotations__rE   rO   rQ   rR   rS   rW   rY   r[   rg   rj   rp   intrt   r   rn   vllm.utils.torch_utilsr   Tensorr   r   library	custom_opr   register_faker   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __all__r   r   r   r   <module>   s  
	






T

		
	
&
&

	
