o
    :/i3@                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 eeZer-dd Znzd d	lmZ W n eyD   d d
lmZ Y nw eejjdrhed	d)dejdejdejdejdB dejf
ddZeejjdred	d)dejdejdejdB dejdejdedejdB dejfddZdejdejdejdB dejdB dejdB d ed!edeejejf fd"d#ZdejdejdejdB dejdB dejdB d ed!edeejejf fd$d%Zd&aG d'd( d(Ze  dS )*    )TYPE_CHECKINGN)flash_attn_varlen_func)init_logger)current_platform)direct_register_custom_opc                    s    fddS )Nc                    s    S N )namefnr   Z/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/_xpu_ops.py<lambda>   s    zregister_fake.<locals>.<lambda>r   r
   r   r
   r   register_fake   s   r   )r   )impl_abstractfp8_gemm_w8a16z_xpu_C::fp8_gemm_w8a16inputq_weightweight_scalebiasreturnc                 C   s>   |  d| jd }|d}|d}tj||f| j| jdS Nr      dtypedeviceviewshapesizetorchemptyr   r   )r   r   r   r   input_2dMNr   r   r   _fp8_gemm_w8a16_fake   s   

r%   int4_gemm_w4a16z_xpu_C::int4_gemm_w4a16qzeros
group_size	group_idxc           
      C   s>   |  d| jd }|d}|d}	tj||	f| j| jdS r   r   )
r   r   r   r   r'   r(   r)   r"   r#   r$   r   r   r   _int4_gemm_w4a16_fake*   s   


r*   	positionsquerykeyoffsetscos_sin_cache
rotary_dimis_neox_stylec              	   C   s&   |d usJ t jj| ||||||S r   )r    ops_xpu_Cdeepseek_scaling_roper+   r,   r-   r.   r/   r0   r1   r   r   r   #_xpu_ops_deepseek_scaling_rope_impl:   s   	r6   c                 C   s   ||fS r   r   r5   r   r   r   #_xpu_ops_deepseek_scaling_rope_fakeI   s   	r7   Fc                )   @   s  e Zd Ze																		d?dejdejdejd	ejd
edededB dedejdB dejdB dejdB de	e dB dedB dejdB dejdB dedededB dejdB f&ddZ
eejddddddddddddfdejd	ejdB dejdB dejdB dedB d dfd!d"Zedejd#ejd$ejd%ed&edB d dfd'd(Zed#ejd)ejd*ejdejd+ejd dfd,d-Zed.ejd/ejd0ejd1ejd2ed3ed4ed5ed ejfd6d7Zed.ejd8ed9ejd1ejd2ed3ed:ed5ed ejfd;d<Zed@d=d>ZdS )Axpu_opsNF           r   qkvcu_seqlens_qmax_seqlen_qmax_seqlen_ksoftmax_scalecausaloutblock_tablealibi_slopeswindow_sizesoftcap	seqused_kcu_seqlens_k	dropout_p
fa_versionreturn_softmax_lses_auxc                 C   s   |d us|d usJ d|d u s|d u sJ d|	d u s$|d us$J d|	d us0|d us0J d|d u r?t j| j| j| jd}|d u rFd}nt|dksNJ |d |d	 f}|	d u rb| }| }t||  ||||||||||	|||d
S )Nz*cu_seqlens_k or seqused_k must be providedz>cu_seqlens_k and seqused_k cannot be provided at the same timez,when enable block_table, seqused_k is neededz4when block_table is disabled, cu_seqlens_k is neededr   r   r   r:   r   r   )rC   r;   r<   r=   r>   rI   rH   r?   r@   rA   rB   rD   rM   rF   rL   )r    r!   r   r   r   len
contiguousr   )r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   scheduler_metadatarK   	q_descale	k_descale	v_descale
num_splitsrL   rM   real_window_sizer   r   r   r   Z   sL   zxpu_ops.flash_attn_varlen_funcrN   cache_seqlenscu_seqlens_k_newcache_leftpad	page_sizer   c                 C   s   t d d S )NzFget_scheduler_metadata is not implemented for xpu_ops, returning None.)loggerwarning_once)
batch_sizer?   r@   num_heads_qnum_heads_kvheaddimrW   	qkv_dtype	headdim_vr>   rX   rY   rZ   max_seqlen_k_newrB   rF   has_softcaprU   pack_gqa	sm_marginr   r   r   get_scheduler_metadata   s   zxpu_ops.get_scheduler_metadatakv_cacheslot_mappingquant_block_size	scale_fmtc                 C   s   | j d }| d|} 		 		 	 ddtjdtdtdtjd B dtd	tjd B d
td B dttjtjf fdd}|| |d|dkd\}}|d|tj	}	|tj	dd}
tj
|	|
gdd} | }|d|j d d||  d S )Nr   绽|=Fxr(   epsr   column_major_scalesout_q	use_ue8m0r   c                 S   s  |d u rd}|d u rt  }| jd | dks$J d| jd  d| | ddks/J d|d u r;tj| |d}n
|j| jksCJ |}| j}|d | }	|d d |	|f }
| |
}tjt|ddd	}t	|tj
|| j| jd
}t|j}t|j}|| }|rtdtt|}n|}|d}|| }t|||}||}||| |r|	f|d d  }|jdgtt|d R  }| |}n| }|| fS )NFr   r   zLast dimension z! must be divisible by group_size r   z&Input tensor groups must be contiguous)r   )dimkeepdim)r   r   g       @)r   	fp8_dtyper   strider    
empty_liker   amaxabsmaximumtensorr   r   finfomaxminpowceillog2	unsqueezeclamptocopy_permuterangerO   rP   float)rm   r(   rn   r   ro   rp   rq   x_qoriginal_shape
num_groupsgroup_shape	x_groupedabs_maxFP8_MAXFP8_MIN	scale_rawscalesscales_expandedx_scaled	x_clampedx_quantizedscales_shapex_sr   r   r   group_quant_torch   sN   	


z<xpu_ops.indexer_k_quant_and_cache.<locals>.group_quant_torchue8m0)r(   ro   rq      rr   r   )rl   NFNN)r   r   r    Tensorintr   r   booltupleuint8catflattenindex_copy_)r<   rh   ri   rj   rk   head_dimr   k_fp8k_scalek_fp8_bytesscale_bytesr   r   r   indexer_k_quant_and_cache   sJ   

T
 z!xpu_ops.indexer_k_quant_and_cachedst_k	dst_scalecu_seq_lensc                 C   s8  | d}| d}| d}|  d}|d | d }	tj||jdd }
t||
d }t|d|d }|
||  }|| }|||f }|d | }| d}| d}|| }|||  }|dtj||jd }|| |dd< |d |	 }|| ||  }|dtj||jd }|| |dd< dS )a  
        Args:
            kv_cache: [num_blocks, block_size, cache_stride] - quantized KV cache
                    Layout per block: [k_values, scale_values]
                    - k_values: [block_size * head_dim]
                    - scale_values: [block_size * head_dim * 4 / quant_block_size]
            dst_k: [num_tokens, head_dim] - output tensor for K values
            dst_scale: [num_tokens, head_dim / quant_block_size * 4]
                - output tensor for scale values
            block_table: [batch_size, num_blocks] - block table for indexing
            cu_seq_lens: [batch_size + 1] - cumulative sequence lengths
        r   r   r   r   r   N)	r   r    aranger   searchsortedr   ru   r   r   )rh   r   r   rD   r   r]   
num_tokensr   cache_block_sizerj   token_indicesbatch_indicesinbatch_seq_indicesblock_indices_in_tablephysical_block_indicesinblock_offsetsblock_stridekv_cache_flatsrc_block_offsetssrc_k_offsets	k_indices
scale_sizesrc_scale_offsetsscale_indicesr   r   r   cp_gather_indexer_k_quant_cache2  s4   





z'xpu_ops.cp_gather_indexer_k_quant_cachelogitscu_seqlen_kscu_seqlen_keraw_topk_indicesnum_rowsstride0strdide1topk_tokensc                 C   s   t || jd }| j|ddd tj}	|	|d d d f 8 }	|	dk}
|	|| d d d f  dk }tj|	dtj|	jd}|
|@ }|		| d |	|d |	jd d |	jd f< d S )Nr   r   r   r   Fr   )
r}   r   topkr   r    int32	full_liker   r   masked_fill_)r   r   r   r   r   r   r   r   	real_topktopk_indicesmask_lomask_himaskr   r   r   top_k_per_row_prefilly  s   "zxpu_ops.top_k_per_row_prefillnext_nseq_lensstride1c                 C   s   | j }|d}	|	| }
tj| jd |dd|	| d}tj|
|d| }tj|
|d| }|| | | d}||k}| | td} | j	|ddd 
tj}d|||k< ||d |jd d |jd f< d S )Nr   r   r   r   z-infr   )r   r   r    r   r   r   expandmasked_fillr   r   r   r   )r   r   r   r   r   r   r   r   r   r]   padded_num_tokensr+   row_indicesnext_n_offsetindex_end_posr   r   r   r   r   top_k_per_row_decode  s    
"zxpu_ops.top_k_per_row_decodec                   C   s$   t stdtg ttjd da d S d S )Nxpu_ops_deepseek_scaling_rope)op_nameop_funcmutates_args	fake_impldispatch_keyT)_OPS_REGISTEREDr   r6   r7   r   r   r   r   r   r   register_ops_once  s   zxpu_ops.register_ops_once)NFNNNNr9   NNr9   Nr:   NNNr   FN)r   N)__name__
__module____qualname__staticmethodr    r   r   r   r   listr   bfloat16rg   strr   r   r   r   r   r   r   r   r   r8   Y   s8   	

K	
oF		#r8   r   )typingr   r    %vllm_xpu_kernels.flash_attn_interfacer   vllm.loggerr   vllm.platformsr   vllm.utils.torch_utilsr   r   r[   r   torch.libraryImportErrorr   hasattrr2   r3   r   r%   r   r*   r   r   r6   r7   r   r8   r   r   r   r   r   <module>   s   


  o