o
    :/iE                     @   sP  d dl Z d dlmZmZ d dlmZmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ er@d dlmZmZmZ eG d	d
 d
ZG dd dZeG dd deZeG dd deZeG dd dZeG dd dZeG dd dZeG dd dZeG dd dZG dd dZG dd dZ G dd  d Z!dS )!    N)defaultdictdeque)	dataclassfield)TYPE_CHECKINGAny)CUDAGraphStat)	PerfStats)SpecDecodingStats)EngineCoreEventEngineCoreOutputFinishReasonc                   @   sH   e Zd ZU dZdZeed< 	 dZeed< 	 dZ	eed< 	 dZ
eed< dS )	BaseCacheStatszStores cache hit statistics.Fresetr   requestsquerieshitsN)__name__
__module____qualname____doc__r   bool__annotations__r   intr   r    r   r   b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/v1/metrics/stats.pyr      s   
 r   c                       sf   e Zd ZdZddeddf fddZdefd	d
Zdd Ze	de
fddZe	defddZ  ZS )CachingMetricszMetrics for caching with a hit rate of the most recent N requests.
    Args:
        interval: The number of the most recent requests to aggregate.
            Defaults to 1000.
      max_recent_requestsreturnNc                    s<   t    || _d| _d| _d| _tttttf   | _	d S Nr   )
super__init__r   aggregated_requestsaggregated_query_totalaggregated_query_hitr   tupler   query_queue)selfr   	__class__r   r   r"   *   s   
zCachingMetrics.__init__statsc                 C   s   |j r|    |jdkrdS | j|j|j|jf |  j|j7  _|  j|j7  _|  j|j7  _t	| jdkrk| j| j
kro| j \}}}|  j|8  _|  j|8  _|  j|8  _t	| jdkrm| j| j
ks?dS dS dS dS )a  Observe the prefix caching for a set of requests.

        This function is called with information gathered when new requests
        are being scheduled and are looking for computed blocks.

        When there are more than `max_recent_requests` requests, the oldest set
        of requests are removed from the metrics.

        Args:
            stats: The prefix cache stats.
        r   N   )r   r   r'   appendr   r   r#   r$   r%   lenr   popleft)r(   r+   old_requestsold_queriesold_hitsr   r   r   observe6   s$   
zCachingMetrics.observec                 C   s    d| _ d| _d| _| j  dS )zReset the metrics.r   N)r#   r$   r%   r'   clearr(   r   r   r   r   ^   s   zCachingMetrics.resetc                 C   s
   | j dkS )z.Return true if no requests have been observed.r   )r#   r5   r   r   r   emptye   s   
zCachingMetrics.emptyc                 C   s   | j dkrdS | j| j  S )z/Calculate the hit rate for the past N requests.r           )r$   r%   r5   r   r   r   hit_ratej   s   
zCachingMetrics.hit_rate)r   )r   r   r   r   r   r"   r   r3   r   propertyr   r6   floatr8   __classcell__r   r   r)   r   r   #   s    (r   c                   @   sV   e Zd ZU dZdZeed< 	 dZeed< 	 dZeed< 	 dedede	d	d
fddZ
d
S )PrefixCacheStatsz
    Stores prefix cache hit statistics.
    - `reset`: Whether `reset_prefix_cache` was invoked.
    - `queries`: Refers to the number of tokens that were queried.
    r   preempted_requestspreempted_queriespreempted_hits
num_tokensnum_hits	preemptedr   Nc                 C   s`   |r|  j d7  _ |  j|7  _|  j|7  _dS |  jd7  _|  j|7  _|  j|7  _dS z-Aggregate request information into the stats.r,   N)r=   r>   r?   r   r   r   )r(   r@   rA   rB   r   r   r   record   s   zPrefixCacheStats.record)r   r   r   r   r=   r   r   r>   r?   r   rD   r   r   r   r   r<   r   s   
 r<   c                   @   s&   e Zd ZdZdededdfddZdS )MultiModalCacheStatsz
    Stores multi-modal cache hit statistics.
    - `reset`: Whether `reset_mm_cache` was invoked.
    - `queries`: Refers to the number of multi-modal data items
      that were queried.
    num_queriesrA   r   Nc                 C   s.   |  j d7  _ |  j|7  _|  j|7  _dS rC   )r   r   r   )r(   rF   rA   r   r   r   rD      s   zMultiModalCacheStats.record)r   r   r   r   r   rD   r   r   r   r   rE      s    rE   c                   @   s2   e Zd ZU dZeed< eed< eedf ed< dS )KVCacheEvictionEventz&Single KV cache block eviction sample.lifetime_secondsidle_seconds.reuse_gaps_secondsN)r   r   r   r   r:   r   r&   r   r   r   r   rG      s
   
 rG   c                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< eed
Zeed< dZedB ed< eed
Zee ed< dZedB ed< dZeeef dB ed< eed
Zeeef ed< eed
Zeeef ed< dZedB ed< dZedB ed< dS )SchedulerStatsz$Stats associated with the scheduler.r   num_running_reqsnum_waiting_reqsstep_countercurrent_waver7   kv_cache_usageencoder_cache_usage)default_factoryprefix_cache_statsNconnector_prefix_cache_statskv_cache_eviction_eventsspec_decoding_statskv_connector_statswaiting_lora_adaptersrunning_lora_adapterscudagraph_stats
perf_stats) r   r   r   r   rL   r   r   rM   rN   rO   rP   r:   rQ   r   r<   rS   rT   listrU   rG   rV   r
   rW   dictstrr   rX   rY   rZ   r   r[   r	   r   r   r   r   rK      s"   
 rK   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< dZeed
< dZeed< dS )RequestStateStatsz3Stats that need to be tracked across delta updates.r   num_generation_tokensr7   arrival_time	queued_tsscheduled_tsfirst_token_tslast_token_tsfirst_token_latencyFis_corruptedN)r   r   r   r   r`   r   r   ra   r:   rb   rc   rd   re   rf   rg   r   r   r   r   r   r_      s   
 r_   c                   @   s   e Zd ZU dZded< dZeed< dZeed< dZ	eed< d	Z
ed	B ed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d	S )FinishedRequestStatsz)Stats associated with a finished request.r   finish_reasonr7   e2e_latencyr   num_prompt_tokensr`   Nmax_tokens_paramqueued_timeprefill_timeinference_timedecode_timemean_time_per_output_tokenFrg   num_cached_tokens)r   r   r   r   r   rj   r:   rk   r   r`   rl   rm   rn   ro   rp   rq   rg   r   rr   r   r   r   r   rh      s   
 rh   c                   @   s   e Zd ZU dZdZeedf ed< dZe	ed< dZ
e	ed< dZe	ed< dZe	ed	< dZe	ed
< dZe	ed< de	de	de	ddfddZdede	fddZdS )PromptTokenStatsab  Breakdown of prompt tokens by source.

    Fields:
        computed: Tokens prefilled locally (actual compute work).
        local_cache_hit: Tokens from local prefix cache.
        external_kv_transfer: Tokens from external KV transfer.
        cached_tokens: Tokens skipped during prefill (from scheduler).
        recomputed_tokens: Cached tokens that were recomputed (see below).
        total: Total prompt tokens.

    Invariants:
        computed + local_cache_hit + external_kv_transfer - recomputed_tokens = total
        local_cache_hit + external_kv_transfer - recomputed_tokens = cached_tokens
    local_computelocal_cache_hitexternal_kv_transfer.ALL_SOURCESr   computedrv   rw   cached_tokensrecomputed_tokenstotalrr   num_external_computed_tokens
prompt_lenr   Nc                 C   sx   |d |krdnd}|  j || 7  _ |  j|7  _|  j|| | 7  _|  j|7  _|  j|7  _|  j|7  _dS )z#Update stats from a prefill output.r,   r   N)ry   rw   rv   rz   r{   r|   )r(   rr   r}   r~   
recomputedr   r   r   update_from_output  s   

z#PromptTokenStats.update_from_outputsourcec                 C   s0   | j | j| jd}||vrtd| || S )z Get token count by source label.rt   zUnknown source: )ry   rv   rw   
ValueError)r(   r   
source_mapr   r   r   get_by_source#  s   zPromptTokenStats.get_by_source)r   r   r   r   rx   r&   r^   r   ry   r   rv   rw   rz   r{   r|   r   r   r   r   r   r   rs      s&   
 
rs   c                   @   s   e Zd ZdZdd ZdefddZedefddZ	d	e
de
fd
dZddde
dedededddedB fddZdeded dededddedB fddZ	d%ddd ed!edB ded"ef
d#d$ZdS )&IterationStatsz8Stats associated with a single set of EngineCoreOutputs.c                 C   sF   t   | _d| _t | _d| _g | _g | _g | _g | _	g | _
d| _d S r    )timeiteration_timestampr`   rs   prompt_token_statsnum_preempted_reqsfinished_requestsmax_num_generation_tokens_itern_params_itertime_to_first_tokens_iterinter_token_latencies_iternum_corrupted_reqsr5   r   r   r   r"   2  s   

zIterationStats.__init__r   c                 C   s0   d dd t|  D }| jj d| dS )Nz, c                 s   s"    | ]\}}| d | V  qdS )=Nr   ).0kvr   r   r   	<genexpr>?  s     z*IterationStats.__repr__.<locals>.<genexpr>())joinvarsitemsr*   r   )r(   field_to_value_strr   r   r   __repr__>  s   zIterationStats.__repr__c                 C   s   | j jS )z1Total prompt tokens (for backward compatibility).)r   r|   r5   r   r   r   rk   B  s   z IterationStats.num_prompt_tokensstartc                 C   s
   | j | S )z=Calculate an interval relative to this iteration's timestamp.)r   )r(   r   r   r   r   _time_sinceG  s   
zIterationStats._time_sinceoutputr   engine_core_timestampis_prefillingr~   	req_statslora_statesLoRARequestStates	lora_nameNc                 C   s   t |j}|  j|7  _|r(| jj|j|j|d | |j}	| j	
|	 |	|_| j|7  _tjr=|js=|jdkr=d|_|jd urN| |j|j|||| |rT||_n||j }
| j
|
 ||_d S )N)rr   r}   r~   r   T)r.   new_token_idsr`   r   r   rr   r}   r   ra   r   r-   rf   envsVLLM_COMPUTE_NANS_IN_LOGITSrg   num_nans_in_logitseventsupdate_from_events
request_idrd   re   r   )r(   r   r   r   r~   r   r   r   num_new_generation_tokensrf   itlr   r   r   r   K  s@   






z!IterationStats.update_from_outputreq_idr   r   c           	      C   s   ddl m} |D ]<}|j|jkr|j|_||| q|j|jkr1|jdkr*|j|_|	|| q|j|j
krD|  jd7  _||| qd S )Nr   )EngineCoreEventTyper7   r,   )vllm.v1.enginer   typeQUEUED	timestamprb   request_waiting	SCHEDULEDrc   request_running	PREEMPTEDr   )	r(   r   r   r   r   r   r   r   eventr   r   r   r     s   

z!IterationStats.update_from_eventsr   ri   r   rk   rl   rr   c                 C   s   |  |j}|j|j }|j|j }|j|j }	|j|j }
|jd dkr,|	|jd  nd}t||||j||||
|	||j|d}| j	
| |jrR|  jd7  _d S d S )Nr,   r   )ri   rj   rk   r`   rl   rm   rn   ro   rp   rq   rg   rr   )r   ra   rc   rb   rd   re   r`   rh   rg   r   r-   r   )r(   ri   rk   rl   r   rr   rj   rm   rn   rp   ro   rq   finished_reqr   r   r   update_from_finished_request  s6   z+IterationStats.update_from_finished_request)r   )r   r   r   r   r"   r^   r   r9   r   rk   r:   r   r   r_   r   r\   r   r   r   r   r   r   r   /  s^    
7
r   c                   @   s@   e Zd ZdZdd ZdededefddZed	efd
dZ	dS )	LoRAStatsz9Tracks waiting and running request IDs for a single LoRA.c                 C   s   t  | _t  | _d S N)setwaitingrunningr5   r   r   r   r"     s   zLoRAStats.__init__r   r   r   c                 C   sN   |r|rJ |r| j | n| j | |r| j| d S | j| d S r   )r   adddiscardr   )r(   r   r   r   r   r   r   update  s   zLoRAStats.updater   c                 C   s   | j p| j S r   r   r   r5   r   r   r   r6     s   zLoRAStats.emptyN)
r   r   r   r   r"   r^   r   r   r9   r6   r   r   r   r   r     s    r   c                   @   s   e Zd ZdZddefddZdededB d	ed
efddZdededB fddZdededB fddZ	dededB fddZ
dedB fddZdS )r   z1A per-LoRA count of running and waiting requests.F	log_statsc                 C   s   || _ tt| _d S r   )r   r   r   r   )r(   r   r   r   r   r"     s   zLoRARequestStates.__init__r   r   Nr   r   c                 C   s@   | j r|d u r	d S | j| }|||| |jr| j|= d S d S r   )r   r   r   r6   )r(   r   r   r   r   
lora_statsr   r   r   _request_update  s   
z!LoRARequestStates._request_updatec                 C      | j ||ddd d S )NTFr   r   r(   r   r   r   r   r   r        z!LoRARequestStates.request_waitingc                 C   r   )NFTr   r   r   r   r   r   r     r   z!LoRARequestStates.request_runningc                 C   s   | j ||ddd d S )NFr   r   r   r   r   r   request_finished  r   z"LoRARequestStates.request_finishedscheduler_statsc                 C   sJ   | j r|d u r	d S | j D ]\}}t|j|j|< t|j|j|< qd S r   )r   r   r   r.   r   rX   r   rY   )r(   r   r   r+   r   r   r   update_scheduler_stats  s   z(LoRARequestStates.update_scheduler_stats)F)r   r   r   r   r   r"   r^   r   r   r   r   rK   r   r   r   r   r   r     s     
r   )"r   collectionsr   r   dataclassesr   r   typingr   r   	vllm.envsr   vllm.compilation.cuda_graphr   vllm.v1.metrics.perfr	   vllm.v1.spec_decode.metricsr
   r   r   r   r   r   r   r<   rE   rG   rK   r_   rh   rs   r   r   r   r   r   r   r   <module>   s>   O> !