from abc import ABC, abstractmethod
from dataclasses import dataclass, replace
from enum import Enum
from typing import TYPE_CHECKING, Any, ClassVar, Generic, Protocol, TypeVar

import numpy as np
import torch
from typing_extensions import deprecated

if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.config.cache import CacheDType
    from vllm.model_executor.layers.linear import ColumnParallelLinear
    from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
    from vllm.platforms.interface import DeviceCapability
    from vllm.v1.attention.backends.utils import KVCacheLayoutType
    from vllm.v1.kv_cache_interface import AttentionSpec

class AttentionType(str, Enum):
    """
    Attention type.
    Use string to be compatible with `torch.compile`.
    """

    DECODER = "decoder"
    ENCODER = "encoder"
    ENCODER_ONLY = "encoder_only"
    ENCODER_DECODER = "encoder_decoder"


class MultipleOf:
    base: int

    def __init__(self, base: int):
        self.base = base

class AttentionBackend(ABC):
    """Abstract class for attention backends."""

    accept_output_buffer: bool = False
    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
    supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = [
        "auto",
        "float16",
        "bfloat16",
    ]
    forward_includes_kv_cache_update: bool = True

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        return [MultipleOf(16)]
    @staticmethod
    @abstractmethod
    def get_name() -> str:
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_impl_cls() -> type["AttentionImplBase"]:
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_builder_cls():
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
        cache_dtype_str: str = "auto",
    ) -> tuple[int, ...]:
        raise NotImplementedError

    @classmethod
    def get_kv_cache_block_dim(
        cls,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
        cache_dtype_str: str = "auto",
    ) -> int:
        """Discover which tensor dim is the block index, since different
        backends lay out dims differently."""
        # Use a sentinel num_blocks value that cannot collide with the other
        # dimensions, then look it up in the resulting shape.
        num_blocks = 8675309
        shape = cls.get_kv_cache_shape(
            num_blocks,
            block_size,
            num_kv_heads,
            head_size,
            cache_dtype_str=cache_dtype_str,
        )
        return shape.index(num_blocks)

    @staticmethod
    def get_kv_cache_stride_order(
        include_num_layers_dimension: bool = False,
    ) -> tuple[int, ...]:
        """
        Get the physical (memory layout) ordering of the kv cache dimensions.
        e.g. if the KV cache shape is
        [2, num_blocks, block_size, num_heads, head_size],
        and get_kv_cache_stride_order returns (1, 3, 0, 2, 4) then the physical
        ordering of dimensions is
        [num_blocks, num_heads, 2, block_size, head_size].

        If this function is unimplemented / raises NotImplementedError,
        the physical layout of the KV cache will match the logical shape.

        Args:
            include_num_layers_dimension: if True, includes an additional
                num_layers dimension, which is assumed to be prepended
                to the logical KV cache shape.
                With the above example, a return value (2, 4, 0, 1, 3, 5)
                corresponds to
                [num_blocks, num_heads, num_layers, 2, block_size, head_size].

                If an additional dimension is NOT included in the returned
                tuple, the physical layout will not include a layers dimension.

        Returns:
            A tuple of ints which is a permutation of range(len(shape)).
        """
        raise NotImplementedError

    @classmethod
    def full_cls_name(cls) -> tuple[str, str]:
        return (cls.__module__, cls.__qualname__)

    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return []

    @classmethod
    def supports_head_size(cls, head_size: int) -> bool:
        supported_head_sizes = cls.get_supported_head_sizes()
        return not supported_head_sizes or head_size in supported_head_sizes

    @classmethod
    def supports_dtype(cls, dtype: torch.dtype) -> bool:
        return dtype in cls.supported_dtypes

    @classmethod
    def supports_kv_cache_dtype(cls, kv_cache_dtype: "CacheDType | None") -> bool:
        if kv_cache_dtype is None:
            return True
        return (
            not cls.supported_kv_cache_dtypes
            or kv_cache_dtype in cls.supported_kv_cache_dtypes
        )

    @classmethod
    def supports_block_size(cls, block_size: int | None) -> bool:
        if block_size is None:
            return True
        supported_kernel_block_sizes = cls.get_supported_kernel_block_sizes()
        if not supported_kernel_block_sizes:
            return True
        for supported_size in supported_kernel_block_sizes:
            if isinstance(supported_size, MultipleOf):
                supported_size = supported_size.base
            if block_size % supported_size == 0:
                return True
        return False

    @classmethod
    def get_preferred_block_size(cls, default_block_size: int) -> int:
        supported_sizes = cls.get_supported_kernel_block_sizes()
        if not supported_sizes:
            return default_block_size
        if cls.supports_block_size(default_block_size):
            return default_block_size
        return min(
            s.base if isinstance(s, MultipleOf) else s for s in supported_sizes
        )

    @classmethod
    def is_mla(cls) -> bool:
        return False

    @classmethod
    def supports_sink(cls) -> bool:
        return False

    @classmethod
    def supports_alibi_sqrt(cls) -> bool:
        return False

    @classmethod
    def supports_mm_prefix(cls) -> bool:
        return False

    @classmethod
    def is_sparse(cls) -> bool:
        return False

    @classmethod
    def supports_per_head_quant_scales(cls) -> bool:
        return False

    @classmethod
    def supports_attn_type(cls, attn_type: AttentionType) -> bool:
        """Check if backend supports a given attention type.

        By default, only supports decoder attention.
        Backends should override this to support other attention types.
        """
        return attn_type == AttentionType.DECODER

    @classmethod
    def supports_compute_capability(cls, capability: "DeviceCapability") -> bool:
        return True

    @classmethod
    def supports_combination(
        cls,
        head_size: int,
        dtype: torch.dtype,
        kv_cache_dtype: "CacheDType | None",
        block_size: int | None,
        use_mla: bool,
        has_sink: bool,
        use_sparse: bool,
        device_capability: "DeviceCapability",
    ) -> str | None:
        return None

    @classmethod
    def validate_configuration(
        cls,
        head_size: int,
        dtype: torch.dtype,
        kv_cache_dtype: "CacheDType | None",
        block_size: int | None,
        use_mla: bool,
        has_sink: bool,
        use_sparse: bool,
        use_mm_prefix: bool,
        use_per_head_quant_scales: bool,
        device_capability: "DeviceCapability",
        attn_type: AttentionType,
    ) -> list[str]:
        invalid_reasons = []
        if not cls.supports_head_size(head_size):
            invalid_reasons.append("head_size not supported")
        if not cls.supports_dtype(dtype):
            invalid_reasons.append("dtype not supported")
        if not cls.supports_kv_cache_dtype(kv_cache_dtype):
            invalid_reasons.append("kv_cache_dtype not supported")
        if not cls.supports_block_size(block_size):
            invalid_reasons.append("block_size not supported")
        if use_mm_prefix and not cls.supports_mm_prefix():
            invalid_reasons.append(
                "partial multimodal token full attention not supported"
            )
        if use_mla != cls.is_mla():
            if use_mla:
                invalid_reasons.append("MLA not supported")
            else:
                invalid_reasons.append("non-MLA not supported")
        if has_sink and not cls.supports_sink():
            invalid_reasons.append("attention sinks not supported")
        if use_sparse != cls.is_sparse():
            if use_sparse:
                invalid_reasons.append("sparse not supported")
            else:
                invalid_reasons.append("non-sparse not supported")
        if use_per_head_quant_scales and not cls.supports_per_head_quant_scales():
            invalid_reasons.append("per-head quant scales not supported")
        if not cls.supports_compute_capability(device_capability):
            invalid_reasons.append("compute capability not supported")
        if not cls.supports_attn_type(attn_type):
            invalid_reasons.append(f"attention type {attn_type} not supported")
        combination_reason = cls.supports_combination(
            head_size,
            dtype,
            kv_cache_dtype,
            block_size,
            use_mla,
            has_sink,
            use_sparse,
            device_capability,
        )
        if combination_reason is not None:
            invalid_reasons.append(combination_reason)
        return invalid_reasons

    @classmethod
    def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
        return None


class AttentionMetadata:
    pass


T = TypeVar("T", bound=AttentionMetadata)

@dataclass
class CommonAttentionMetadata:
    """
    Per-batch attention metadata, shared across layers and backends.
    AttentionMetadataBuilder instances use it to construct per-layer metadata.

    For many of the tensors we keep both GPU and CPU versions.
    """

    query_start_loc: torch.Tensor
    query_start_loc_cpu: torch.Tensor

    seq_lens: torch.Tensor

    num_reqs: int
    num_actual_tokens: int
    max_query_len: int
    max_seq_len: int

    block_table_tensor: torch.Tensor
    slot_mapping: torch.Tensor

    causal: bool = True

    logits_indices_padded: torch.Tensor | None = None
    num_logits_indices: int | None = None

    encoder_seq_lens: torch.Tensor | None = None
    encoder_seq_lens_cpu: torch.Tensor | None = None

    dcp_local_seq_lens: torch.Tensor | None = None
    dcp_local_seq_lens_cpu: torch.Tensor | None = None

    _seq_lens_cpu: torch.Tensor | None = None
    _num_computed_tokens_cpu: torch.Tensor | None = None
    _num_computed_tokens_cache: torch.Tensor | None = None

    def batch_size(self) -> int:
        return self.seq_lens.shape[0]

    def naive_query_lens(self) -> torch.Tensor:
        """Naive because it assumes that query ends where the next query starts."""
        return self.query_start_loc[1:] - self.query_start_loc[:-1]

    def replace(self, **kwargs) -> "CommonAttentionMetadata":
        return replace(self, **kwargs)

    @property
    @deprecated(
        "Prefer using device seq_lens directly to avoid implicit H<>D sync. "
        "If a CPU copy is needed, use `seq_lens.cpu()` instead. "
        "Will be removed in a future release, please migrate as soon as possible."
    )
    def seq_lens_cpu(self) -> torch.Tensor:
        if self._seq_lens_cpu is None:
            self._seq_lens_cpu = self.seq_lens.to("cpu")
        return self._seq_lens_cpu

    @property
    @deprecated(
        "Prefer using device seq_lens directly to avoid implicit H<>D sync which "
        "breaks full async scheduling. If a CPU copy is needed, it can be derived "
        "from query_start_loc_cpu and seq_lens. "
        "Will be removed in a future release, please migrate as soon as possible."
    )
    def num_computed_tokens_cpu(self) -> torch.Tensor:
        if self._num_computed_tokens_cpu is None:
            query_seq_lens = (
                self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1]
            )
            self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens
        return self._num_computed_tokens_cpu

    def compute_num_computed_tokens(self) -> torch.Tensor:
        """Compute num_computed_tokens on device (seq_lens - query_lens)."""
        if self._num_computed_tokens_cache is None:
            query_lens = self.query_start_loc[1:] - self.query_start_loc[:-1]
            self._num_computed_tokens_cache = self.seq_lens - query_lens
        return self._num_computed_tokens_cache

    def unpadded(
        self, num_actual_reqs: int, num_actual_tokens: int
    ) -> "CommonAttentionMetadata":
        maybe_slice_reqs = lambda x: x[:num_actual_reqs] if x is not None else None
        return CommonAttentionMetadata(
            query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
            query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
            seq_lens=self.seq_lens[:num_actual_reqs],
            _seq_lens_cpu=(
                self._seq_lens_cpu[:num_actual_reqs]
                if self._seq_lens_cpu is not None
                else None
            ),
            _num_computed_tokens_cpu=(
                self._num_computed_tokens_cpu[:num_actual_reqs]
                if self._num_computed_tokens_cpu is not None
                else None
            ),
            num_reqs=num_actual_reqs,
            num_actual_tokens=num_actual_tokens,
            max_query_len=self.max_query_len,
            max_seq_len=self.max_seq_len,
            block_table_tensor=self.block_table_tensor[:num_actual_reqs],
            slot_mapping=self.slot_mapping[:num_actual_tokens],
            causal=self.causal,
            logits_indices_padded=self.logits_indices_padded,
            num_logits_indices=self.num_logits_indices,
            encoder_seq_lens=maybe_slice_reqs(self.encoder_seq_lens),
            encoder_seq_lens_cpu=maybe_slice_reqs(self.encoder_seq_lens_cpu),
            dcp_local_seq_lens=maybe_slice_reqs(self.dcp_local_seq_lens),
            dcp_local_seq_lens_cpu=maybe_slice_reqs(self.dcp_local_seq_lens_cpu),
        )


M = TypeVar("M")


class AttentionCGSupport(Enum):
    """Constants for the cudagraph support of the attention backend
    Here we do not consider the cascade attention, as currently
    it is never cudagraph supported."""

    ALWAYS = 3
    UNIFORM_BATCH = 2
    UNIFORM_SINGLE_TOKEN_DECODE = 1
    NEVER = 0


class AttentionMetadataBuilder(ABC, Generic[M]):
    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER
    reorder_batch_threshold: int | None = None
    supports_update_block_table: bool = False

    @abstractmethod
    def __init__(
        self,
        kv_cache_spec: "AttentionSpec",
        layer_names: list[str],
        vllm_config: "VllmConfig",
        device: torch.device,
    ):
        self.kv_cache_spec = kv_cache_spec
        self.layer_names = layer_names
        self.vllm_config = vllm_config
        self.device = device

    @classmethod
    def get_cudagraph_support(
        cls,
        vllm_config: "VllmConfig",
        kv_cache_spec: "AttentionSpec",
    ) -> AttentionCGSupport:
        """Get the cudagraph support level of this builder class."""
        return cls._cudagraph_support

    def _init_reorder_batch_threshold(
        self,
        reorder_batch_threshold: int | None = 1,
        supports_spec_as_decode: bool = False,
        supports_dcp_with_varlen: bool = False,
    ) -> None:
        self.reorder_batch_threshold = reorder_batch_threshold
        if self.reorder_batch_threshold is not None and supports_spec_as_decode:
            speculative_config = self.vllm_config.speculative_config
            if (
                speculative_config is not None
                and speculative_config.num_speculative_tokens is not None
            ):
                max_num_queries_for_spec = 1 + (
                    1
                    if speculative_config.parallel_drafting
                    else speculative_config.num_speculative_tokens
                )
                self.reorder_batch_threshold = max(
                    self.reorder_batch_threshold, max_num_queries_for_spec
                )
        if (
            self.vllm_config.parallel_config.decode_context_parallel_size > 1
            and not supports_dcp_with_varlen
        ):
            self.reorder_batch_threshold = 1

    @abstractmethod
    def build(
        self,
        common_prefix_len: int,
        common_attn_metadata: CommonAttentionMetadata,
        fast_build: bool = False,
    ) -> M:
        """
        Central method that builds attention metadata.
        Some builders (MLA) require reorder_batch to be called prior to build.

        Args:
            common_prefix_len: The length of the common prefix of the batch.
            common_attn_metadata: The common attention metadata.
            fast_build: The meta-data will prioritize speed of building over
                the speed at execution. Can be used for spec-decode where the
                result of a build call may only be used for few layers/iters.
        """
        raise NotImplementedError

    def update_block_table(
        self, metadata: M, blk_table: torch.Tensor, slot_mapping: torch.Tensor
    ) -> M:
        """
        Update the block table for the attention metadata.
        Faster when there are multiple kv-cache groups that create virtually the
        same metadata but just with different block tables.

        Only needs to be implemented if supports_update_block_table is True.
        """
        raise NotImplementedError

    def build_for_cudagraph_capture(
        self, common_attn_metadata: CommonAttentionMetadata
    ) -> M:
        """
        Build attention metadata for CUDA graph capture. Uses build by default.
        Subclasses that override this method should call self.build or
        super().build_for_cudagraph_capture.
        """
        return self.build(
            common_prefix_len=0, common_attn_metadata=common_attn_metadata
        )

    def build_for_drafting(
        self,
        common_attn_metadata: CommonAttentionMetadata,
        draft_index: int,
    ) -> M:
        """
        Build attention metadata for draft model. Uses build by default.

        Args:
            common_attn_metadata: The common attention metadata.
            draft_index: The index of the current draft operation.
                When speculating a chain of tokens, this index refers to the
                draft attempt for the i-th token.
                For tree-based attention, this index instead refers to the
                draft attempt for the i-th level in the tree of tokens.
        """
        return self.build(
            common_prefix_len=0,
            common_attn_metadata=common_attn_metadata,
            fast_build=True,
        )

    def use_cascade_attention(
        self,
        common_prefix_len: int,
        query_lens: np.ndarray,
        num_query_heads: int,
        num_kv_heads: int,
        use_alibi: bool,
        use_sliding_window: bool,
        use_local_attention: bool,
        num_sms: int,
        dcp_world_size: int,
    ) -> bool:
        return False


class AttentionLayer(Protocol):
    _q_scale: torch.Tensor
    _k_scale: torch.Tensor
    _v_scale: torch.Tensor
    _q_scale_float: float
    _k_scale_float: float
    _v_scale_float: float
    _prob_scale: torch.Tensor

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor: ...


class AttentionImplBase(ABC, Generic[T]):
    """Base class for attention implementations.

    Contains common attributes and initialization logic shared by both
    standard AttentionImpl and MLAAttentionImpl. Does not define a forward
    method - subclasses define their own forward interfaces.
    """

    num_heads: int
    head_size: int
    scale: float

    can_return_lse_for_decode: bool = False
    supports_pcp: bool = False
    supports_mtp_with_cp_non_trivial_interleave_size: bool = False
    need_to_return_lse_for_decode: bool = False
    supports_quant_query_input: bool = False

    dcp_world_size: int
    dcp_rank: int
    pcp_world_size: int
    pcp_rank: int
    total_cp_world_size: int
    total_cp_rank: int

    def __new__(cls, *args, **kwargs):
        self = super().__new__(cls)
        try:
            from vllm.distributed.parallel_state import get_dcp_group

            self.dcp_world_size = get_dcp_group().world_size
            self.dcp_rank = get_dcp_group().rank_in_group
        except AssertionError:
            # The DCP group may not be initialized (e.g. in tests).
            self.dcp_world_size = 1
            self.dcp_rank = 0
        try:
            from vllm.distributed.parallel_state import get_pcp_group

            self.pcp_world_size = get_pcp_group().world_size
            self.pcp_rank = get_pcp_group().rank_in_group
        except AssertionError:
            # The PCP group may not be initialized (e.g. in tests).
            self.pcp_world_size = 1
            self.pcp_rank = 0
        self.total_cp_world_size = self.pcp_world_size * self.dcp_world_size
        self.total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank
        self.need_to_return_lse_for_decode = (
            self.total_cp_world_size > 1 and self.can_return_lse_for_decode
        )
        return self

    def process_weights_after_loading(self, act_dtype: torch.dtype):
        pass

class AttentionImpl(AttentionImplBase[T], Generic[T]):
    """Standard attention implementation with forward method."""

    @abstractmethod
    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int | None = None,
        alibi_slopes: list[float] | None = None,
        sliding_window: int | None = None,
        kv_cache_dtype: str = "auto",
        logits_soft_cap: float | None = None,
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: str | None = None,
    ) -> None:
        raise NotImplementedError

    @abstractmethod
    def forward(
        self,
        layer: AttentionLayer,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: T,
        output: torch.Tensor | None = None,
        output_scale: torch.Tensor | None = None,
        output_block_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        raise NotImplementedError

    def fused_output_quant_supported(self, quant_key: "QuantKey") -> bool:
        """
        Does this attention implementation support fused output quantization.
        This is used by the AttnFusionPass to only fuse output quantization
        onto implementations that support it.

        :param quant_key: QuantKey object that describes the quantization op
        :return: is fusion supported for this type of quantization
        """
        return False

    def fused_rope_kvcache_supported(self) -> bool:
        """
        Does this attention implementation support RoPE+KVCache fusion.
        This is used by the RopeKVCacheFusionPass to only fuse the RoPE ops
        with the KV cache update for implementations that support it.
        """
        return False

    def do_rope_and_kv_cache_update(
        self,
        layer: AttentionLayer,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        positions: torch.Tensor,
        cos_sin_cache: torch.Tensor,
        is_neox: bool,
        kv_cache: torch.Tensor,
        layer_slot_mapping: torch.Tensor,
    ):
        """
        If `fused_rope_kvcache_supported` returns True, this method will be called
        by torch.ops.vllm.fused_rope_and_unified_kv_cache_update
        to perform the inplace RoPE and KV cache update.
        """
        raise NotImplementedError


class MLAAttentionImpl(AttentionImplBase[T], Generic[T]):
    """MLA attention implementation with forward_mqa and forward_mha methods."""

    @abstractmethod
    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: list[float] | None,
        sliding_window: int | None,
        kv_cache_dtype: str,
        logits_soft_cap: float | None,
        attn_type: str,
        kv_sharing_target_layer_name: str | None,
        q_lora_rank: int | None,
        kv_lora_rank: int,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        qk_head_dim: int,
        v_head_dim: int,
        kv_b_proj: "ColumnParallelLinear",
        indexer: object | None = None,
        q_pad_num_heads: int | None = None,
    ) -> None:
        raise NotImplementedError

    @abstractmethod
    def forward_mha(
        self,
        q: torch.Tensor,
        kv_c_normed: torch.Tensor,
        k_pe: torch.Tensor,
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: T,
        k_scale: torch.Tensor,
        output: torch.Tensor,
    ) -> None:
        """MHA-style prefill forward pass."""
        raise NotImplementedError

    @abstractmethod
    def forward_mqa(
        self,
        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: T,
        output: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """MQA-style decode forward pass."""
        raise NotImplementedError

    def do_kv_cache_update(
        self,
        k_pe: torch.Tensor,
        kv_c_normed: torch.Tensor,
        kv_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: torch.Tensor,
    ) -> None:
        if kv_cache.numel() == 0:
            return
        from vllm import _custom_ops as ops

        ops.concat_and_cache_mla(
            kv_c_normed,
            k_pe.squeeze(1),
            kv_cache,
            slot_mapping.flatten(),
            kv_cache_dtype=kv_cache_dtype,
            scale=k_scale,
        )


class SparseMLAAttentionImpl(AttentionImplBase[T], Generic[T]):
    """Sparse MLA attention implementation with only forward_mqa method.

    Sparse MLA implementations only support decode (MQA-style) attention.
    They do not support prefill (MHA-style) attention.
    """

    @abstractmethod
    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: list[float] | None,
        sliding_window: int | None,
        kv_cache_dtype: str,
        logits_soft_cap: float | None,
        attn_type: str,
        kv_sharing_target_layer_name: str | None,
        q_lora_rank: int | None,
        kv_lora_rank: int,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        qk_head_dim: int,
        v_head_dim: int,
        kv_b_proj: "ColumnParallelLinear",
        indexer: object | None = None,
        q_pad_num_heads: int | None = None,
    ) -> None:
        raise NotImplementedError

    @abstractmethod
    def forward_mqa(
        self,
        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: T,
        output: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """MQA-style decode forward pass."""
        raise NotImplementedError

    def do_kv_cache_update(
        self,
        k_pe: torch.Tensor,
        kv_c_normed: torch.Tensor,
        kv_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: torch.Tensor,
    ) -> None:
        if kv_cache.numel() == 0:
            return
        from vllm import _custom_ops as ops

        ops.concat_and_cache_mla(
            kv_c_normed,
            k_pe.squeeze(1),
            kv_cache,
            slot_mapping.flatten(),
            kv_cache_dtype=kv_cache_dtype,
            scale=k_scale,
        )


def is_quantized_kv_cache(kv_cache_dtype: str) -> bool:
    return kv_cache_dtype.startswith("fp8")


def subclass_attention_backend(
    name_prefix: str,
    attention_backend_cls: type[AttentionBackend],
    builder_cls: type[AttentionMetadataBuilder[M]],
) -> type[AttentionBackend]:
    """
    Return a new subclass where `get_builder_cls` returns `builder_cls`.
    """
    name = name_prefix + attention_backend_cls.__name__
    return type(
        name, (attention_backend_cls,), {"get_builder_cls": lambda: builder_cls}
    )


def subclass_attention_backend_with_overrides(
    name_prefix: str,
    attention_backend_cls: type[AttentionBackend],
    overrides: dict[str, Any],
) -> type[AttentionBackend]:
    name = name_prefix + attention_backend_cls.__name__
    return type(name, (attention_backend_cls,), overrides)
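

if __name__ == "__main__":
    # Illustrative usage sketch only, not part of the vLLM API surface: the
    # backend name, the 64-token kernel block size, and the toy builder below
    # are made-up stand-ins that exercise the classmethod capability queries
    # and `subclass_attention_backend` defined in this module.
    class _ToyBuilder(AttentionMetadataBuilder[AttentionMetadata]):
        def build(self, common_prefix_len, common_attn_metadata, fast_build=False):
            return AttentionMetadata()

    class _ToyBackend(AttentionBackend):
        @staticmethod
        def get_name() -> str:
            return "TOY_ATTN"

        @staticmethod
        def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
            return [MultipleOf(64)]

    # Derive a variant whose get_builder_cls() returns the toy builder.
    derived = subclass_attention_backend("Toy", _ToyBackend, _ToyBuilder)
    print(derived.get_builder_cls() is _ToyBuilder)  # True
    print(_ToyBackend.supports_block_size(128))      # True: 128 % 64 == 0
    print(_ToyBackend.supports_block_size(48))       # False: not a multiple of 64
    print(_ToyBackend.get_preferred_block_size(48))  # 64: smallest supported base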