o
    :/i                     @   s  U d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ ddlZddlZddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZm Z  ddl!m"Z" ee#Z$G dd de%Z&e
G dd dZ'e
G dd dZ(e
G dd dZ)G dd dZ*G dd deZ+G dd dZ,i Z-e.e/e0d f e1d< G dd deeZ2G dd de+Z3G d d! d!e+Z4G d"d# d#e+Z5G d$d% d%e2Z6G d&d' d'e+Z7G d(d) d)e+Z8G d*d+ d+e+Z9G d,d- d-e+Z:G d.d/ d/e+Z;G d0d1 d1e2Z<G d2d3 d3e2Z=G d4d5 d5Z>G d6d7 d7Z?G d8d9 d9Z@G d:d; d;ZAd<ejBd=e.eCeDeE f fd>d?ZFd@eEdAe/fdBdCZGdHd@eEdDeDe/ dEeEfdFdGZHdS )Iz
Analytic flops/memory estimation module for transformer components,
to help derive MFU (Model Flops Utilization) stats for a running model.
    N)ABCabstractmethod)Iterable)asdict	dataclass)AnyProtocol)	BaseModelFieldValidationErrormodel_validator)Self)
VllmConfig)init_logger)STR_DTYPE_TO_TORCH_DTYPEget_dtype_sizeget_kv_cache_torch_dtype)SchedulerOutputc                   @   s   e Zd ZdZdS )InvalidComponentzt
    Custom exception to indicate that a certain ComponentMetric is not
    applicable to the given VllmConfig.
    N)__name__
__module____qualname____doc__ r   r   a/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/v1/metrics/perf.pyr   "   s    r   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZ	e
eef dB ed< dZe
eef dB ed< dZe
eef dB ed	< dZe
eef dB ed
< dS )DebugPerfStats        calc_durationr   num_prefill_requestsnum_decode_requestsNcontext_breakdownnum_flops_per_gpu_breakdown num_read_bytes_per_gpu_breakdown!num_write_bytes_per_gpu_breakdown)r   r   r   r   float__annotations__r   intr   r    dictstrr!   r"   r#   r   r   r   r   r   .   s   
 r   c                   @   sB   e Zd ZU dZeed< dZeed< dZeed< dZe	dB ed< dS )	PerfStatsr   num_flops_per_gpunum_read_bytes_per_gpunum_write_bytes_per_gpuNdebug_stats)
r   r   r   r*   r&   r%   r+   r,   r-   r   r   r   r   r   r)   :   s
   
 r)   c                	   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< dZeed
< dedededdfddZdefddZdefddZdefddZededededd fddZdS )ExecutionContexta  
    Represents an execution context for a batch of requests.

    This class aggregates statistics across multiple requests in a batch,
    separately tracking prefill and decode phases.

    Example)
    - Batch with one full prefill (2048 tokens) and one decode (1 token, 8192 context):
      ctx = ExecutionContext()
      ctx.add(2048, 2048, is_prefill=True)
      ctx.add(1, 8192, is_prefill=False)
    r   r   prefill_num_tokensprefill_context_lenprefill_token_context_productr   decode_num_tokensdecode_context_lendecode_token_context_product
num_tokenscontext_len
is_prefillreturnNc                 C   s   |r"|  j d7  _ |  j|7  _|  j|7  _|  j|| 7  _dS |  jd7  _|  j|7  _|  j|7  _|  j|| 7  _dS )z8Add a single request's statistics to this batch context.   N)r   r/   r0   r1   r   r2   r3   r4   )selfr5   r6   r7   r   r   r   add]   s   zExecutionContext.addc                 C      | j | j S )z8Total number of tokens across all requests in the batch.)r/   r2   r:   r   r   r   total_num_tokensj      z!ExecutionContext.total_num_tokensc                 C   r<   )z<Total sum of (num_tokens * context_len) across all requests.)r1   r4   r=   r   r   r   total_token_context_productn   r?   z,ExecutionContext.total_token_context_productc                 C   r<   )zNumber of tokens that require logits computation (unembedding).

        For prefill, only the last token per request needs logits.
        For decode, all tokens need logits.
        )r   r2   r=   r   r   r   num_logits_tokensr   s   z"ExecutionContext.num_logits_tokensc                 C   s   |  }| ||| |S )zwCreate an ExecutionContext from a single request.

        This is a convenience method primarily for testing.
        )r;   )clsr5   r6   r7   ctxr   r   r   from_single_requestz   s   z$ExecutionContext.from_single_request)r   r   r   r   r   r&   r%   r/   r0   r1   r   r2   r3   r4   boolr;   r>   r@   rA   classmethodrD   r   r   r   r   r.   B   s0   
 r.   c                   @   sN   e Zd ZdZdedefddZdededdfdd	Zdeeef fd
dZ	dS )
ParsedArgsz
    Syntactic sugar so that Parsers can use dot notations
    to access/update the parsed arguments.

    e.g.)
        args = ParsedArgs()
        args.x = 3
        args.y = args.x + 1
    namer8   c                 C   s   t dt| j d| d)N'z' has no attribute ')AttributeErrortyper   )r:   rH   r   r   r   __getattr__   s   zParsedArgs.__getattr__valueNc                 C   s   t | || d S N)object__setattr__)r:   rH   rM   r   r   r   rP      s   zParsedArgs.__setattr__c                 C   s   t |  S rN   )varscopyr=   r   r   r   
model_dump      zParsedArgs.model_dump)
r   r   r   r   r(   r   rL   rP   r'   rS   r   r   r   r   rG      s
    
rG   c                   @   s"   e Zd ZdededefddZdS )Parserargsvllm_configr8   c                 C      dS )z
        Parse the vllm config and update the current ParsedArgs and pass it on.
        If the parser isn't applicable to the vllm_config, it will do nothing.
        Nr   r:   rV   rW   r   r   r   parse   s   zParser.parseN)r   r   r   rG   r   rZ   r   r   r   r   rU      s    rU   c                   @   sF   e Zd ZdZdeddfddZdeddfdd	Zd
edefddZ	dS )ParserChainz
    Applies chain of parser in a sequential order.
    Later parsers might overwrite results from previous parsers,
    so parsers should be chained in the appropriate order if they
    are not mutually exclusive.
    parsersr8   Nc                 G   s   t || _d S rN   )listr\   )r:   r\   r   r   r   __init__      zParserChain.__init__parserc                 C   s   | j | d S rN   )r\   append)r:   r`   r   r   r   
add_parser      zParserChain.add_parserrW   c                 C   s"   t  }| jD ]}|||}q|S rN   )rG   r\   rZ   )r:   rW   rV   r`   r   r   r   rZ      s   
zParserChain.parse)
r   r   r   r   rU   r^   rb   r   rG   rZ   r   r   r   r   r[      s
    r[   ComponentMetrics_COMPONENT_METRICS_REGISTRYc                
   @   s.  e Zd ZdZeedefddZeedefddZ	dd Z
ed	edefd
dZedeed   fddZe	ddededeeef fddZe	ddededeeef fddZe	ddededeeef fddZddededefddZddededefddZddededefddZdS )rd   a-  
    Each concrete ComponentMetrics class is associated with:
    - fields that are required for metric derivation
      (fields are specified/validated through pydantic model)
    - parser to parse VllmConfig into fields
    - metric methods that derive flops/bytes for a given execution context
    r8   c                 C      d S rN   r   rB   r   r   r   component_type      zComponentMetrics.component_typec                 C   rX   )a  
        Return a ParserChain that provides values for all required fields.
        The returned parser chain must populate ParsedArgs with values for every
        field defined on this ComponentMetrics class. Missing fields will cause
        a ValidationError when from_vllm_config() is called.
        See individual Parser docstrings for which args they provide, and field
        comments on ComponentMetrics subclasses for which parser provides each field.
        Nr   rg   r   r   r   
get_parser   s   zComponentMetrics.get_parserc                 C   s   | t |  < d S rN   )re   rh   rg   r   r   r   __init_subclass__   rc   z"ComponentMetrics.__init_subclass__rW   c              
   C   sV   |   }||}z| | W S  ty* } ztd|   d| |d}~ww )zj
        Instantiate this class from VllmConfig.
        Raises ValidationError if parsing fails.
        zInvalid z	 config: N)rj   rZ   model_validaterS   r   r   rh   )rB   rW   r`   parsed_argser   r   r   from_vllm_config   s   
z!ComponentMetrics.from_vllm_configc                 C   s   t t S rN   )iterre   valuesrg   r   r   r   registered_metrics   r?   z#ComponentMetrics.registered_metricsTrC   per_gpuc                 C   rf   rN   r   r:   rC   rs   r   r   r   get_num_flops_breakdown      z(ComponentMetrics.get_num_flops_breakdownc                 C   rf   rN   r   rt   r   r   r   get_read_bytes_breakdown   rv   z)ComponentMetrics.get_read_bytes_breakdownc                 C   rf   rN   r   rt   r   r   r   get_write_bytes_breakdown   rv   z*ComponentMetrics.get_write_bytes_breakdownc                 C      t | || S rN   )sumru   rq   rt   r   r   r   get_num_flops      zComponentMetrics.get_num_flopsc                 C   ry   rN   )rz   rw   rq   rt   r   r   r   get_read_bytes  r|   zComponentMetrics.get_read_bytesc                 C   ry   rN   )rz   rx   rq   rt   r   r   r   get_write_bytes  r|   z ComponentMetrics.get_write_bytesNT)r   r   r   r   rF   r   r(   rh   r[   rj   rk   r   r   ro   r   rK   rr   r.   rE   r'   r&   ru   rw   rx   r{   r}   r~   r   r   r   r   rd      sV    


c                   @   &   e Zd ZdZdededefddZdS )BaseConfigParserz
    Parses base model configuration.
    Provides: vocab_size, hidden_size, num_attention_heads, num_hidden_layers,
    weight_byte_size, activation_byte_size, dp_size, tp_size, pp_size, enable_ep
    rV   rW   r8   c                 C   s   |j }| |_| |_t|jd|_t|jd|_|j j	}t
|tj	r(|}nt
|tr6|tv r6t| }n	td| tj}t||_d|_|jj|_|jj|_|jj|_|jj|_|S )Nnum_attention_headsnum_hidden_layersz.Unknown model_dtype %s, defaulting to bfloat16   )model_configget_vocab_size
vocab_sizeget_hidden_sizehidden_sizeget_requiredhf_text_configr   r   dtype
isinstancetorchr(   r   loggerwarningbfloat16r   weight_byte_sizeactivation_byte_sizeparallel_configdata_parallel_sizedp_sizetensor_parallel_sizetp_sizepipeline_parallel_sizepp_sizeenable_expert_parallel	enable_ep)r:   rV   rW   r   model_dtypetorch_dtyper   r   r   rZ     s4   







zBaseConfigParser.parseNr   r   r   r   rG   r   rZ   r   r   r   r   r         r   c                   @   r   )BaseAttentionConfigParserzo
    Parses attention-specific configuration.
    Provides: num_key_value_heads, head_dim, cache_byte_size
    rV   rW   r8   c                 C   sB   |j }| |_| |_|j j}|jj}t||}t	||_
|S rN   )r   get_total_num_kv_headsnum_key_value_headsget_head_sizehead_dimr   cache_configcache_dtyper   r   cache_byte_size)r:   rV   rW   r   r   r   kv_cache_torch_dtyper   r   r   rZ   E  s   



zBaseAttentionConfigParser.parseNr   r   r   r   r   r   ?      r   c                   @   r   )!AttentionQuantizationConfigParserza
    Parses quantization configuration for attention layers.
    Overrides: weight_byte_size
    rV   rW   r8   c                 C   sB   |j }|d u r	|S | }|dv rd|_|S |dkrd|_|S tN)fp8
fbgemm_fp8r9   mxfp4g      ?quant_configget_namer   r   r:   rV   rW   cfgquant_methodr   r   r   rZ   Z  s   z'AttentionQuantizationConfigParser.parseNr   r   r   r   r   r   T  r   r   c                	   @   sd  e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< edddZ
eed< edddZeed	< edddZeed
< edddZeed< edddZeed< edddZeeB ed< edefddZedefddZ	ddededeeef fddZ	ddededeeef fddZ	ddededeeef fddZdS )AttentionMetrics.r   gtr   r   r   r   r   r   r   r   r   r   r8   c                 C   rX   )Nattnr   rg   r   r   r   rh     ri   zAttentionMetrics.component_typec                 C   s   t t t t S rN   )r[   r   r   r   rg   r   r   r   rj     s
   zAttentionMetrics.get_parserTrC   rs   c           
      C   s   | j | j| j| j| jf\}}}}}| }| }	|r0|| j }td|| j	 }td|| j	 }d| | |d|   | | d| |	 | | d| |	 | | d| | | | | dS )Nr9   r   )qkv_projattn_qkattn_avout_proj)
r   r   r   r   r   r>   r@   r   maxr   )
r:   rC   rs   LDqkvdTTCr   r   r   ru     s"   
z(AttentionMetrics.get_num_flops_breakdownc           
      C   sV  | j | j| j| j| jf\}}}}}| }|r,|| j }td|| j }td|| j }i }	|| | j	 | |	d< t
||d|   | | j | |	d< |jdkrf|j| d|j |  | | j	 | |	d< |jdkr|	dd|j| | | j	 | d|j | | | j |   |	d< || | | j	 | |	d< t
|| | | j | |	d< |	S )	Nr9   	qkv_inputr   
qkv_weightr   
attn_input	out_input
out_weight)r   r   r   r   r   r>   r   r   r   r   r&   r   r/   r0   r2   getr3   r   )
r:   rC   rs   r   r   r   r   r   r   
read_bytesr   r   r   rw     s@   
&


z)AttentionMetrics.get_read_bytes_breakdownc           	      C   s   | j | j| j| j| jf\}}}}}| }|r,|| j }td|| j }td|| j }||d|   | | j	 | d| | | | j
 | || | j	 | dS )z4Calculate write memory traffic for attention layers.r9   r   )
qkv_outputkv_cache
out_output)r   r   r   r   r   r>   r   r   r   r   r   )	r:   rC   rs   r   r   r   r   r   r   r   r   r   rx     s   
z*AttentionMetrics.get_write_bytes_breakdownNr   )r   r   r   r
   r   r&   r%   r   r   r   r   r   r   r   r   r   r$   rF   r(   rh   r[   rj   r.   rE   r'   ru   rw   rx   r   r   r   r   r   p  sN   
 



.
r   c                   @   r   )BaseFfnConfigParserz
    Parses FFN and MoE configuration.
    Provides: intermediate_size, num_experts, num_experts_per_tok,
    moe_intermediate_size, num_shared_experts, num_moe_layers
    rV   rW   r8   c                 C   s   |j j}t|dr|jd ur|j}t|d|jd |_|j  |_t	|ddgd|_
t	|ddgd|_t	|dd	gd|_|jdk}|rI|j|_|S d|_|S )
Ntext_configintermediate_size   num_experts_per_tokmoe_topkr   moe_intermediate_sizen_shared_expertsnum_shared_experts)r   	hf_confighasattrr   getattrr   r   get_num_expertsnum_expertsgetattr_from_listr   r   r   r   num_moe_layers)r:   rV   rW   r   is_moer   r   r   rZ     s&   



zBaseFfnConfigParser.parseNr   r   r   r   r   r     r   r   c                   @   r   )FfnParallelParserzW
    Parses FFN parallelism configuration.

    Provides: ffn_tp_size, ffn_ep_size
    rV   rW   r8   c                 C   s<   |j rd|j|j }}n	|j|j d}}||_||_|S )Nr9   )r   r   r   ffn_tp_sizeffn_ep_size)r:   rV   rW   r   r   r   r   r   rZ     s   zFfnParallelParser.parseNr   r   r   r   r   r     r   r   c                   @   r   )InterleaveMoeLayerStepParserzg
    Parses interleave_moe_layer_step field for models like Llama4.

    Overrides: num_moe_layers
    rV   rW   r8   c                    sX   |j j t dr jd ur j t dr* jdkr*t fddt|jD |_|S )Nr   interleave_moe_layer_stepr   c                    s"   g | ]}|d   j  dkr|qS )r9   r   )r   .0layerr   r   r   
<listcomp>;  s
    z6InterleaveMoeLayerStepParser.parse.<locals>.<listcomp>)	r   r   r   r   r   lenranger   r   rY   r   r   r   rZ   1  s   

z"InterleaveMoeLayerStepParser.parseNr   r   r   r   r   r   *  r   r   c                   @   r   )MoeLayerFreqParserzy
    Parses moe_layer_freq and first_k_dense_replace fields for models like Deepseek.

    Overrides: num_moe_layers
    rV   rW   r8   c                    sX   |j j t dr jd ur j t dr*t dr*t fddt|jD |_|S )Nr   moe_layer_freqfirst_k_dense_replacec                    s(   g | ]}| j kr| j d kr|qS r   )r   r   r   r   r   r   r   S  s    
z,MoeLayerFreqParser.parse.<locals>.<listcomp>)r   r   r   r   r   r   r   r   rY   r   r   r   rZ   L  s   
	zMoeLayerFreqParser.parseNr   r   r   r   r   r   E  r   r   c                   @   r   )FfnQuantizationConfigParserz\
    Parses quantization configuration for FFN layers.

    Overrides: weight_byte_size
    rV   rW   r8   c                 C   sD   |j }|d u r	|S | }|dv rd|_	 |S |dkr d|_|S tr   r   r   r   r   r   rZ   e  s   z!FfnQuantizationConfigParser.parseNr   r   r   r   r   r   ^  r   r   c                	   @   s  e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< edddZ
eed< edddZeed	< edddZeed
< edZeed< edZeed< edZeed< edZeed< edddZeed< edddZeeB ed< edddefddZedefddZedefddZ	d&dededeeef fdd Z	d&dededeeef fd!d"Z 	d&dededeeef fd#d$Z!d%S )'
FfnMetrics.r   r   r   r   r   r   r   r   r   r   r9   r   r   r   )ger   r   after)moder8   c                 C   sP   | j dkr&| jsJ d| j| jsJ d| j| js&J d| j| S )zJValidate that MoE-related fields are properly set when num_moe_layers > 0.r   zself.num_experts=zself.num_experts_per_tok=zself.moe_intermediate_size=)r   r   r   r   r=   r   r   r   validate_moe_fields  s
   
zFfnMetrics.validate_moe_fieldsc                 C   rX   )Nffnr   rg   r   r   r   rh     ri   zFfnMetrics.component_typec                 C   s   t t t t t t t S rN   )r[   r   r   r   r   r   r   rg   r   r   r   rj     s   zFfnMetrics.get_parserTrC   rs   c                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
|| }|r'|
| nd}|rJ|| j }|| j }|| j	 }|durC|| j	 }|rJ|| j
 }i }|r\d| d | |
 | |d< |rn|rnd| d | | | |d< |r|	rd| d | |	 |
 | |d< |S )z)Calculate flops breakdown for FFN layers.r   Nr      	dense_ffn
routed_ffn
shared_ffn)r   r   r   r   r   r   r   r>   r   r   r   )r:   rC   rs   r   r   DILmEMISr   Ldnum_activated_tokensflopsr   r   r   ru     s2   




 z"FfnMetrics.get_num_flops_breakdownc                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
| j}|| }|r*|
| nd}|rV|| j	 }|| j	 }|| j
 }|durF|| j
 }|rM|| j }|durV|| j }i }|rt|
| | j | |d< td| | | j | |d< td|
 | | j | |d< t|
| | j | |d< t|| | j | |d< |r@|rt||}t|| | j | |d	< td| | | | j | |d
< td| | | j | |d< t|| | j | |d< t|| | | j | |d< |	r@t|
| | j | |d< td| | |	 | j | |d< td|
 | |	 | j | |d< t|
| | j | |d< t|| |	 | j | |d< |S )z-Calculate read memory traffic for FFN layers.r   Ndense_up_gate_inputr   dense_up_gate_weightsdense_silu_inputdense_down_inputdense_down_weightsrouted_up_gate_inputrouted_up_gate_weightsrouted_silu_inputrouted_down_inputrouted_down_weightsshared_up_gate_inputshared_up_gate_weightsshared_silu_inputshared_down_inputshared_down_weights)r   r   r   r   r   r   r   r>   r   r   r   r   r&   r   r   min)r:   rC   rs   r   r   r   r   r   r   r   r   r   r  r  r   num_activated_expertsr   r   r   rw     s   






z#FfnMetrics.get_read_bytes_breakdownc                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
|| }|r'|
| nd}|rJ|| j }|| j }|| j	 }|durC|| j	 }|rJ|| j
 }i }|rwtd|
 | | j | |d< t|
| | j | |d< t|
| | j | |d< |r|rtd| | | j | |d< t|| | j | |d< t|| | j | |d	< |	rtd|
 |	 | | j | |d
< t|
|	 | | j | |d< t|
|	 | | j | |d< |S )z.Calculate write memory traffic for FFN layers.r   Nr   dense_up_gate_outputdense_silu_outputdense_down_outputrouted_up_gate_outputrouted_silu_outputrouted_down_outputshared_up_gate_outputshared_silu_outputshared_down_output)r   r   r   r   r   r   r   r>   r   r   r   r&   r   )r:   rC   rs   r   r   r   r   r   r   r   r   r  r  write_bytesr   r   r   rx   8  sd   




z$FfnMetrics.get_write_bytes_breakdownNr   )"r   r   r   r
   r   r&   r%   r   r   r   r   r   r   r   r   r   r   r   r   r$   r   r   r   rF   r(   rh   r[   rj   r.   rE   r'   ru   rw   rx   r   r   r   r   r   }  sX   
 	

,

[
r   c                	   @   s   e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< eed< e
d	efd
dZe
d	efddZ	ddeded	eeef fddZ	ddeded	eeef fddZ	ddeded	eeef fddZdS )UnembedMetrics.r   r   r   r   r   r   r   r8   c                 C   rX   )Nunembedr   rg   r   r   r   rh     ri   zUnembedMetrics.component_typec                 C   s
   t t S rN   )r[   r   rg   r   r   r   rj     s   zUnembedMetrics.get_parserTrC   rs   c                 C   s8   | j | j}}| }|r|| j }dd| | | iS )z0Calculate flops breakdown for unembedding layer.r   r   )r   r   rA   r   r:   rC   rs   r   Vr   r   r   r   ru     s   
z&UnembedMetrics.get_num_flops_breakdownc                 C   sB   | j | j}}| }|r|| j }|| | j || | j dS )z4Calculate read memory traffic for unembedding layer.)inputweight)r   r   rA   r   r   r   r!  r   r   r   rw     s   
z'UnembedMetrics.get_read_bytes_breakdownc                 C   s.   | j }| }|r|| j }d|| | j iS )z5Calculate write memory traffic for unembedding layer.output)r   rA   r   r   )r:   rC   rs   r"  r   r   r   r   rx     s   
z(UnembedMetrics.get_write_bytes_breakdownNr   )r   r   r   r
   r   r&   r%   r   r   r   rF   r(   rh   r[   rj   r.   rE   r'   ru   rw   rx   r   r   r   r   r  ~  sD   
 




r  c                	   @   s   e Zd ZdeddfddZdefddZdd	ed
edefddZ	dd	ed
edefddZ
dd	ed
edefddZ	dd	ed
edeeef fddZ	dd	ed
edeeef fddZ	dd	ed
edeeef fddZdedefddZdS )ModelMetricsrW   r8   Nc                 C   s   || _ g | _t D ]7}z||}| j| td| t	| W q
 t
yA } ztd| t	| W Y d}~q
d}~ww dS )z
        Parse vllm_config to instantiate metrics for each component.
        is_enabled() will return False if no component metrics could be instantiated.
        z,Instantiated ComponentMetrics [%s] with (%s)z Failed to instantiate %s from %sN)rW   metricsrd   rr   ro   ra   r   inforh   r(   r   debug)r:   rW   
metric_clsmetricrn   r   r   r   r^     s(   
zModelMetrics.__init__c                 C   s   t | jdkS Nr   )r   r'  r=   r   r   r   
is_enabled  r_   zModelMetrics.is_enabledTrC   rs   c                       t  fdd| jD S )Nc                 3       | ]	}|  V  qd S rN   )r{   r   r+  rC   rs   r   r   	<genexpr>      z-ModelMetrics.get_num_flops.<locals>.<genexpr>rz   r'  rt   r   r1  r   r{        zModelMetrics.get_num_flopsc                    r.  )Nc                 3   r/  rN   )r}   r0  r1  r   r   r2    r3  z.ModelMetrics.get_read_bytes.<locals>.<genexpr>r4  rt   r   r1  r   r}     r5  zModelMetrics.get_read_bytesc                    r.  )Nc                 3   r/  rN   )r~   r0  r1  r   r   r2    r3  z/ModelMetrics.get_write_bytes.<locals>.<genexpr>r4  rt   r   r1  r   r~     r5  zModelMetrics.get_write_bytesc                    H   i }| j D ]}|||}|   fdd| D }|| q|S )Nc                        i | ]\}}  d | |qS .r   r   keyval	componentr   r   
<dictcomp>       z8ModelMetrics.get_num_flops_breakdown.<locals>.<dictcomp>)r'  ru   rh   itemsupdater:   rC   rs   totalr+  	breakdownprefixedr   r=  r   ru        
z$ModelMetrics.get_num_flops_breakdownc                    r6  )Nc                    r7  r8  r   r:  r=  r   r   r?    r@  z9ModelMetrics.get_read_bytes_breakdown.<locals>.<dictcomp>)r'  rw   rh   rA  rB  rC  r   r=  r   rw     rG  z%ModelMetrics.get_read_bytes_breakdownc                    r6  )Nc                    r7  r8  r   r:  r=  r   r   r?    r@  z:ModelMetrics.get_write_bytes_breakdown.<locals>.<dictcomp>)r'  rx   rh   rA  rB  rC  r   r=  r   rx     rG  z&ModelMetrics.get_write_bytes_breakdownscheduler_outputc                 C   s$  t  }t }|jD ]}|j}|j|d}|dkrq
|j| }|j||dd q
|j	}t
|jD ]$\}	}|j|d}|dkrAq1|j|	 }
|
| }|dk}|||| q1| |d}| |d}| |d}tt| t| t| }tjrtt  | |j|jt|||||_|S )zV
        Calculate perf stats for the current step based on scheduled tokens.
        r   T)r7   r9   )time	monotonicr.   scheduled_new_reqsreq_idnum_scheduled_tokensr   num_computed_tokensr;   scheduled_cached_reqs	enumeratereq_idsru   rw   rx   r)   rz   rq   envsVLLM_DEBUG_MFU_METRICSr   r   r   r   r-   )r:   rH  t0rC   new_reqrL  r5   r6   cached_reqsirN  r7   num_flops_breakdownread_bytes_breakdownwrite_bytes_breakdown
perf_statsr   r   r   get_step_perf_stats_per_gpu  sJ   







z(ModelMetrics.get_step_perf_stats_per_gpur   )r   r   r   r   r^   rE   r-  r.   r&   r{   r}   r~   r'   r(   ru   rw   rx   r   r)   r\  r   r   r   r   r&    sF    





r&  c                   @   s@   e Zd Zdd Zdd Zdeddfdd	Zd
edefddZ	dS )PerfMetricsDebugLoggingc                 C   s   |    d S rN   )resetr=   r   r   r   r^   J  rT   z PerfMetricsDebugLogging.__init__c                 C   s4   d| _ d| _d| _d| _i | _i | _i | _i | _d S )Nr   r   )total_calc_durationtotal_num_prefill_requeststotal_num_decode_requeststotal_num_batchestotal_context_breakdown!total_num_flops_per_gpu_breakdown"total_read_bytes_per_gpu_breakdown#total_write_bytes_per_gpu_breakdownr=   r   r   r   r^  M  s   
zPerfMetricsDebugLogging.resetr-   r8   Nc                 C   s   |  j |j7  _ |  j|j7  _|  j|j7  _|  jd7  _t| j| j	| j
| jg|j|j|j|jgD ]\}}t|ts?J | D ]\}}||d| ||< qCq4d S )Nr9   r   )r_  r   r`  r   ra  r   rb  ziprc  rd  re  rf  r    r!   r"   r#   r   r'   rA  r   )r:   r-   dstsrcr;  r<  r   r   r   observeW  s*   zPerfMetricsDebugLogging.observe
log_prefix
delta_timec                 C   s   dd | j  D }dd | j D }dd | j D }td|tj| j| j	| j
| j||||dd| j| dd		d
d d S )Nc                 S   "   i | ]\}}||d  ddqS )   mB.1fTFr   r   kvr   r   r   r?  q      z/PerfMetricsDebugLogging.log.<locals>.<dictcomp>c                 S   rm      eAro  GBr   rq  r   r   r   r?  u  rt  c                 S   rm  ru  r   rq  r   r   r   r?  y  rt  z%sMFU details: %sro  sz.1%)	prefill_reqsdecode_reqsnum_batchesr    flops_breakdownnum_read_bytes_breakdownnum_write_bytes_breakdowndurationmfu_calc_overheadr   )indent)rd  rA  re  rf  r   r)  jsondumpsr`  ra  rb  rc  r_  )r:   log_fnrk  rl  rd  re  rf  r   r   r   logo  s4   
zPerfMetricsDebugLogging.log)
r   r   r   r^   r^  r   rj  r(   r$   r  r   r   r   r   r]  I  s
    
r]  c                   @   sN   e Zd ZdefddZdd Zdeddfd	d
Zej	dfde
ddfddZdS )PerfMetricsLoggingrW   c                 C   s0   || _ |jj| _d | _tjrt | _|   d S rN   )	rW   r   r   r   debug_loggingrR  rS  r]  r^  )r:   rW   r   r   r   r^     s   
zPerfMetricsLogging.__init__c                 C   s4   t  | _d| _d| _d| _| jr| j  d S d S r,  )rI  rJ  last_log_timetotal_num_flops_per_gputotal_read_bytes_per_gputotal_write_bytes_per_gpur  r^  r=   r   r   r   r^    s   
zPerfMetricsLogging.resetr[  r8   Nc                 C   sZ   |  j |j7  _ |  j|j7  _|  j|j7  _| jr+|jd us"J | j|j d S d S rN   )	r  r*   r  r+   r  r,   r  r-   rj  )r:   r[  r   r   r   rj    s   zPerfMetricsLogging.observe rk  c                 C   s   | j s| js| jsd S t }|| j }|dkrd}d}n| j | d }| j| j | d }|d||| | jr@| j||| |   d S )Nr   rn  rv  z"%sMFU: %.1f TF/s/GPU %.1f GB/s/GPU)	r  r  r  rI  rJ  r  r  r  r^  )r:   r  rk  nowrl  avg_tflops_per_gpuavg_gbps_per_gpur   r   r   r    s8   

zPerfMetricsLogging.log)r   r   r   r   r^   r^  r)   rj  r   r(  r(   r  r   r   r   r   r    s
    

	r  c                	   @   sP   e Zd ZdZejZdedee	 de
eee f fddZdded	efd
dZdS )PerfMetricsProma  Record performance metrics in Prometheus.

    Average TFLOPS (tera floating-point operations per second) can be
    calculated using a PromQL query:

      rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12

    Average memory bandwidth in GB/s can be calculated using:

      (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) +
       rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9
    rW   
labelnamesper_engine_labelvaluesc                 C   sX   | j dd|d}t||| _| j dd|d}t||| _| j dd|d}t||| _d S )Nz"vllm:estimated_flops_per_gpu_totalzaEstimated number of floating point operations per GPU (for Model Flops Utilization calculations).)rH   documentationr  z'vllm:estimated_read_bytes_per_gpu_totalz^Estimated number of bytes read from memory per GPU (for Model Flops Utilization calculations).z(vllm:estimated_write_bytes_per_gpu_totalz_Estimated number of bytes written to memory per GPU (for Model Flops Utilization calculations).)_counter_clsmake_per_enginecounter_flopscounter_read_bytescounter_write_bytes)r:   rW   r  r  r  r  r  r   r   r   r^     s,   
zPerfMetricsProm.__init__r   r[  
engine_idxc                 C   sP   |j s|js|jsd S | j| |j  | j| |j | j| |j d S rN   )r*   r+   r,   r  incr  r  )r:   r[  r  r   r   r   rj    s   zPerfMetricsProm.observeNr   )r   r   r   r   prometheus_clientCounterr  r   r]   r(   r'   r&   rO   r^   r)   rj  r   r   r   r   r    s    
(r  counterr  c                    s    fdd|  D S )z&Create a counter for each label value.c                    s   i | ]
\}}| j | qS r   )labels)r   idxlabelvaluesr  r   r   r?     s    
z#make_per_engine.<locals>.<dictcomp>)rA  )r  r  r   r  r   r    s   
r  objattrc                 C   s$   t | |std| dt| |S )zMGet an attr from an object, or throw a InvalidComponentError if it's not set.zMissing required attr z
 in config)r   r   r   )r  r  r   r   r   r   )  s   

r   attrsdefaultc                 C   s&   |D ]}t | |rt| |  S q|S )zdTry to get the first attr that exists in the object
    from a list of attrs. Otherwise return None.)r   r   )r  r  r  r  r   r   r   r   0  s
   
r   rN   )Ir   r  rI  abcr   r   collections.abcr   dataclassesr   r   typingr   r   r  r   pydanticr	   r
   r   r   typing_extensionsr   	vllm.envsrR  vllm.configr   vllm.loggerr   vllm.utils.torch_utilsr   r   r   vllm.v1.core.sched.outputr   r   r   	Exceptionr   r   r)   r.   rG   rU   r[   re   r'   r(   rK   r%   rd   r   r   r   r   r   r   r   r   r   r   r  r&  r]  r  r  r  r&   r]   rO   r  r   r   r   r   r   r   <module>   sn   D	K4 !  B 
KDD
 