o
    :/iw                 	   @   s`  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d dl&m'Z' d d
l(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC d dlDmEZEmFZFmGZGmHZHmIZI d dlJmKZK d dlLmMZM d dlNmOZO d dlPmQZQmRZRmSZSmTZTmUZUmVZV d dlWmXZXmYZY d dlZm[Z[ d dl\m]Z]m^Z^m_Z_m`Z`maZa d dlbmcZc d dldmeZe d dlfmgZgmhZh d dlimjZjmkZk d dllmmZmmnZn d dlompZp d dlqmrZrmsZs d dltmuZumvZv d dlwmxZx d dlymzZz d dl{m|Z| d dl}m~Z~ d dlmZ d d lmZ d d!lmZ d d"lmZ d d#lmZ erd d$lmZ d d%lmZ d d&lmZ d d'lmZ neZeZeZeZejeZed(Zee eB Zeed)< ee eB Zeed*< d+eegef d,eegef fd-d.Zd+eegef d,eegedB f fd/d0Zd1ed,eeeef B dB fd2d3Zd4ed5ed,e%e fd6d7Zd8ee d5ed,efd9d:Zd8ee d5ed,efd;d<Zd8ee d,eeef fd=d>Zd8ee d5ed,eeef fd?d@Zd4ed,efdAdBZd4ed,ee fdCdDZedEdF ejD pYejd   ZdGpYedHZejdIdJdKe,d,eeeeef f fdLdMZdKe,d,eeeeef f fdNdOZe	G dPdQ dQZe	G dRdS dSeZdTefdUdVZdWed,efdXdYZdWed,efdZd[ZdS )\    N)Callable)MISSING	dataclassfieldsis_dataclass)permutations)	UnionType)
TYPE_CHECKING	AnnotatedAnyLiteral	TypeAliasTypeVarUnioncastget_args
get_origin)TypeAdapterValidationError)	FieldInfo)TypeIs)AttentionConfigCacheConfigCompilationConfig
ConfigTypeDeviceConfigECTransferConfig
EPLBConfigKernelConfigKVEventsConfigKVTransferConfig
LoadConfig
LoRAConfigModelConfigMultiModalConfigObservabilityConfigOffloadConfigParallelConfigPoolerConfigPrefetchOffloadConfigProfilerConfigSchedulerConfigSpeculativeConfigStructuredOutputsConfigUVAOffloadConfig
VllmConfigWeightTransferConfigget_attr_docs)
CacheDTypeKVOffloadingBackendMambaCacheMode
MambaDTypePrefixCachingHashAlgo)Device)
MoEBackend)MaxLoRARanks)ConvertOptionHfOverridesLogprobsMode
ModelDTypeRunnerOptionTokenizerMode)MMCacheTypeMMEncoderTPMode)DetailedTraceModules)All2AllBackendDataParallelBackendDCPCommBackendDistributedExecutorBackendExpertPlacementStrategy)SchedulerPolicy)	get_field)OptimizationLevelPerformanceMode)init_loggersuppress_logging)CpuArchEnumcurrent_platformload_general_plugins)is_in_ray_actoris_ray_initialized)is_interleavedmaybe_override_with_speculators)is_gguf)get_model_path)is_cloud_storage)FlexibleArgumentParser)	GiB_bytes)get_ip)resolve_kv_cache_dtype_string)AttentionBackendEnum)LogitsProcessor)QuantizationMethods)LoadFormatsUsageContext)ExecutorTTypeHint	TypeHintTreturn_typereturnc                    s   dt dtf fdd}|S )Nvalrh   c              
      s>   z | W S  t y } ztd|  d  d|d }~ww )NzValue z cannot be converted to .)
ValueErrorargparseArgumentTypeError)ri   erg    b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/engine/arg_utils.py_parse_type   s   
zparse_type.<locals>._parse_typestrrd   )rg   rr   rp   ro   rq   
parse_type   s   ru   c                    s   dt dtd B f fdd}|S )Nri   rh   c                    s    | dks| dkr
d S t  | S )N None)ru   ri   ro   rp   rq   _optional_type   s   z%optional_type.<locals>._optional_typers   )rg   ry   rp   ro   rq   optional_type   s   rz   ri   c                 C   s"   t d| s
t| S ttj| S )Nz(?s)^\s*{.*}\s*$)rematchrt   rz   jsonloadsrx   rp   rp   rq   union_dict_and_str   s   r   	type_hinttypec                 C   s   | |u p	t | |u S )z*Check if the type hint is a specific type.)r   )r   r   rp   rp   rq   is_type   s   r   
type_hintsc                    s   t  fdd| D S )z0Check if the type hints contain a specific type.c                 3       | ]}t | V  qd S Nr   ).0r   r   rp   rq   	<genexpr>       z contains_type.<locals>.<genexpr>)anyr   r   rp   r   rq   contains_type   s   r   c                    s   t  fdd| D dS )z*Get the specific type from the type hints.c                 3   s    | ]
}t | r|V  qd S r   r   r   thr   rp   rq   r      s    zget_type.<locals>.<genexpr>N)nextr   rp   r   rq   get_type   s   r   c                    st   t | t}t|}t|d  t fdd|D s)td| ddd |D  t| tr0dnd	}d
 |t|iS )zGet the `type` and `choices` from a `Literal` type hint in `type_hints`.

    If `type_hints` also contains `str`, we use `metavar` instead of `choices`.
    r   c                 3   r   r   )
isinstance)r   optionoption_typerp   rq   r      r   z$literal_to_kwargs.<locals>.<genexpr>z*All options must be of the same type. Got z with types c                 S   s   g | ]}t |qS rp   r   )r   crp   rp   rq   
<listcomp>   s    z%literal_to_kwargs.<locals>.<listcomp>metavarchoicesr   )	r   r   r   r   allrk   r   rt   sorted)r   r   optionskwargrp   r   rq   literal_to_kwargs   s   
r   c                    s   t | |}t|}|d  t fdd|D s J d| dt tthv r8tt v s6J d  dt  |tusAt|v rEddS t	|dS )	Nr   c                 3   s     | ]}|t ur| u V  qd S r   )Ellipsis)r   t	elem_typerp   rq   r      s    z'collection_to_kwargs.<locals>.<genexpr>z8All non-Ellipsis elements must be of the same type. Got rj   zTIf element can have multiple types, one must be 'str' (i.e. 'list[int | str]'). Got +)r   nargs)
r   r   r   r   r   r   rt   tupler   len)r   r   r   typesrp   r   rq   collection_to_kwargs   s$   

r   c                 C   s
   | j dkS )z*Check if the class is not a built-in type.builtins)
__module__)r   rp   rp   rq   is_not_builtin   s   
r   c                 C   sj   t  }t| }t| }|tu r|t|d  |S |tthv r.|D ]	}|t| q"|S ||  |S )z6Extract type hints from Annotated or Union type hints.r   )	setr   r   r
   updateget_type_hintsr   r   add)r   r   originargsargrp   rp   rq   r      s   
r   c                 c   s    | ]}d |v V  qdS )--helpNrp   )r   r   rp   rp   rq   r          r   mkdocszmkdocs/__main__.py   )maxsizeclsc              	   C   s  t rt| ni }i }t| D ]}t|j}dd |D }t|d }|jturP|j}t|t	rO|j
d u r8|j}n!t  |
 }W d    n1 sJw   Y  n	|j
turY|
 }|j}||d }	|	dd}	||	d||< d}
|d ur|fdtd	tfd
d}||| d< || d  d|
 7  < nt|trtj|| d< nt|tr|| t| nt|tr|| t|t nt|tr|| t|t nt|tr|| t|t nt|tr#|dkrt|| d< || d  dtj 7  < n|dv rt || d< || d  dt j 7  < npt|| d< nit|t!r0t!|| d< n\t|t"rMt|tsFt#dd |D rMt$|| d< n?t|t"rjt%t&j'|| d< || d  d|
 7  < n"t|tszt#dd |D rt|| d< nt(d| d| dt)|| dtu r|| t|| d h td |v rt|tst*|| d || d< || dr|| d +d q|S )Nc                 s   s    | ]	}t |r|V  qd S r   )r   r   rp   rp   rq   r      s    z"_compute_kwargs.<locals>.<genexpr>rv   %z%%)defaulthelpzFShould either be a valid JSON string or JSON keys passed individually.ri   rh   c              
   S   s:   zt || W S  ty } ztt||d }~ww r   )r   validate_jsonr   rl   rm   repr)ri   r   rn   rp   rp   rq   parse_dataclass!  s   z(_compute_kwargs.<locals>.parse_dataclassr   r   z

actionmax_model_len)max_num_batched_tokenskv_cache_memory_bytesc                 s       | ]}t |V  qd S r   r   r   rp   rp   rq   r   A  r   c                 s   r   r   r   r   rp   rp   rq   r   G  s    
zUnsupported type z for argument rj   r   rw   ),
NEEDS_HELPr1   r   r   r   r   r   r   r   r   default_factoryrM   namegetstripreplacert   r   r   boolrl   BooleanOptionalActionr   r   r   r   r   listr   inthuman_readable_int_or_auto__doc__human_readable_intfloatdictr   r   ru   r}   r~   rk   r   rz   append)r   cls_docskwargsfieldr   	generatordataclass_clsr   r   r   json_tipr   rp   rp   rq   _compute_kwargs   s   














r   c                 C   s   t t| S )a{  Return argparse kwargs for the given Config dataclass.

    If `--help` or `mkdocs` are not present in the command line command, the
    attribute documentation will not be included in the help output.

    The heavy computation is cached via functools.lru_cache, and a deep copy
    is returned so callers can mutate the dictionary without affecting the
    cached version.
    )copydeepcopyr   )r   rp   rp   rq   
get_kwargs\  s   
r   c                	   @   s  e Zd ZU dZejZeed< ejZe	ed< ej
Z
eed< ejZeee B dB ed< ejZedB ed< ejZedB ed< ejZeed	< ejZeed
< ejZe	ed< ejZe	ed< ejZeeB ed< ejZe	ed< ejZeed< ejZee dB ed< ejZedB ed< ejZeed< ejZeeB ed< ejZeed< ej Z e!ed< e"j#Z$e%ed< ej&Z&e'ed< ej(Z(e'ed< e)j*Z*ee' dB ed< e+e)dZ,e'dB ed< e-j.Z.ee/B e0e1 B dB ed< e-j2Z2e'ed< e-j3Z3eed< e-j4Z4e'ed< e-j5Z5e'ed< e-j6Z6e'ed < e-j7Z7e'dB ed!< e-j8Z8e'ed"< e-j9Z9e'ed#< e-j:Z:e'ed$< e-j;Z;e<ed%< e-j=Z=e'ed&< e-j>Z>e'ed'< e-j?Z?e'ed(< dZ@e'dB ed)< dZAe'dB ed*< dZBe'dB ed+< dZCedB ed,< dZDe'dB ed-< d.ZEe	ed/< d.ZFe	ed0< e-jGZGeHed1< e-jIZIe	ed2< e-jJZJe	ed3< eKjLZLeMed4< e-jNZNeOed5< e-jPZPe	ed6< e-jQZQe	ed7< e-jRZRe'ed8< e-jSZSe'ed9< e-jTZTe'ed:< e-jUZUe	dB ed;< e+e-d<ZVeWed<< e-jXZXe	ed=< e-jYZYeZed>< e-j[Z[e'ed?< e-j\Z\e'ed@< e-j]Z]e'dB edA< dZ^e'dB edB< dZ_e	dB edC< e"j`Z`eaedD< ejbZbe	edE< ejcZce	edF< edjeZeeedG< efjgZgehedH< e+efdIZieje edI< ekjlZle'edJ< ekjmZme'edK< ekjnZne'edL< e+ekdMZoeje edM< e"jpZpehedN< e"jqZqe'dB edO< dZre'dB edP< esjtZte'edQ< esjuZue'edR< esjvZve'edS< dZwe'dB edT< ejxZxe'edU< ejyZyezedV< d.Z{e	edW< d.Z|e	edX< ej}Z}edB edY< ej~Z~edB edZ< ejZe	eB dB ed[< e+ed\Zeed\< ejZedB ed]< ejZeeB dB ed^< ejZe	ed_< ejZe	ed`< e-jZe	eda< ejZe	edb< e+edcZeee'eee'f B f edd< ejZe	ede< ejZe	edf< e+edgZeeeeef f edg< ejZeeef dB edh< ejZehedi< ejZedB edj< ejZe'edk< ejZe	edl< ejZeedm< ejZeeB dB edn< dZedB edo< ejZe	edp< ejZehdB edq< d.Ze	edr< ejZe'eds< ejZeedt< ejZeeef dB edu< ejZe	edv< ejZe'dB edw< ejZeej B dB edx< ejZe	edy< ejZe	edz< e-jZe	ed{< e"jZe'dB ed|< e+ed}Zeed}< e+ed~Zeee B ed~< dZe	dB ed< esjZe	ed< esjZe	dB ed< e+edZeed< ejZeed< dZedB ed< dZeeef dB ed< ejZedB ed< ejZedB ed< ejZee dB ed< ejZe	ed< e+edZehed< ejZe	ed< ejZe	ed< ejZe	ed< ejZe	ed< ejZe	ed< esjZeed< esjZee0e B dB ed< ejZedB ed< e+edZe)ed< e+edZeed< e+edZeKed< e+eKdZe	ed< e-jZeed< e-jZeed< e+edZeed< dZedB ed< dZedB ed< dZedB ed< ejZeed< ejZe	ed< e+edZeeef ed< ejZeed< ejZedB ed< ejZedB ed< e"jZe	ed< e"jZeed< e"jZeed< e+e"dZe'dB ed< e"jZeed< e+edZeeef ed< ejZe	ed< ejZeeeef B ed< ejZeee0e B  dB ed< 	 esjZe	dB ed< esjZe'ed< e"jZe	ed< ejZeed< ejZeed< e"jZehdB ed< e"jZeed< d.Ze	ed< dZe'ed< e+edZedB ed< d.Ze	ed< dZed dB ed< dd ZededefddZe dejfddZdefddĄZddƄ ZdefddȄZdede-dedB fdd̄Z		.dd͐e	dB de	defddЄZ
dd҄ Ze de'deee	dB e'f ee	dB e'f f fddՄZdeddfdd؄Zd͐e	dB defddڄZdS )
EngineArgszArguments for vLLM engine.modelenable_return_routed_expertsmodel_weightsNserved_model_name	tokenizerhf_config_pathrunnerconvertskip_tokenizer_initenable_prompt_embedstokenizer_modetrust_remote_codeallowed_local_media_pathallowed_media_domainsdownload_dirsafetensors_load_strategyload_formatconfig_formatdtypekv_cache_dtypeseedr   cudagraph_capture_sizesmax_cudagraph_capture_sizedistributed_executor_backendpipeline_parallel_sizemaster_addrmaster_portnnodes	node_rankdistributed_timeout_secondstensor_parallel_sizeprefill_context_parallel_sizedecode_context_parallel_sizedcp_comm_backenddcp_kv_cache_interleave_sizecp_kv_cache_interleave_sizedata_parallel_sizedata_parallel_rankdata_parallel_start_rankdata_parallel_size_localdata_parallel_addressdata_parallel_rpc_portFdata_parallel_hybrid_lbdata_parallel_external_lbdata_parallel_backendenable_expert_parallelenable_ep_weight_filtermoe_backendall2all_backendenable_elastic_ep
enable_dboubatch_sizedbo_decode_token_thresholddbo_prefill_token_threshold#disable_nccl_for_dp_synchronizationeplb_configenable_eplbexpert_placement_strategy_api_process_count_api_process_rankmax_parallel_loading_workers
block_sizeenable_prefix_cachingprefix_caching_hash_algodisable_sliding_windowdisable_cascade_attnoffload_backendcpu_offload_gbcpu_offload_paramsoffload_group_sizeoffload_num_in_groupoffload_prefetch_stepoffload_paramsgpu_memory_utilizationr   r   max_num_partial_prefillsmax_long_partial_prefillslong_prefill_token_thresholdmax_num_seqsmax_logprobslogprobs_modedisable_log_statsaggregate_engine_loggingrevisioncode_revisionhf_tokenhf_overridestokenizer_revisionquantizationallow_deprecated_quantizationenforce_eagerdisable_custom_all_reducelanguage_model_onlylimit_per_promptlimit_mm_per_promptenable_mm_embedsinterleave_mm_stringsmedia_io_kwargsmm_processor_kwargsmm_processor_cache_gbmm_processor_cache_typemm_shm_cache_max_object_size_mbmm_encoder_onlymm_encoder_tp_modemm_encoder_attn_backendio_processor_pluginskip_mm_profilingvideo_pruning_rateenable_lora	max_lorasmax_lora_rankdefault_mm_lorasfully_sharded_lorasmax_cpu_loras
lora_dtypeenable_tower_connector_loraspecialize_active_loraray_workers_use_nsightnum_gpu_blocks_overridemodel_loader_extra_configignore_patternsenable_chunked_prefilldisable_chunked_mm_inputdisable_hybrid_kv_cache_managerstructured_outputs_configreasoning_parserreasoning_parser_pluginspeculative_configshow_hidden_metrics_for_versionotlp_traces_endpointcollect_detailed_traceskv_cache_metricskv_cache_metrics_samplecudagraph_metricsenable_layerwise_nvtx_tracingenable_mfu_metrics enable_logging_iteration_detailsenable_mm_processor_statsscheduling_policyscheduler_clspooler_configcompilation_configattention_configkernel_configenable_flashinfer_autotune
worker_clsworker_extension_clsprofiler_configkv_transfer_configkv_events_configec_transfer_configgeneration_configenable_sleep_modeoverride_generation_config
model_imploverride_attention_dtypeattention_backendcalculate_kv_scalesmamba_cache_dtypemamba_ssm_cache_dtypemamba_block_sizemamba_cache_modeadditional_configuse_tqdm_on_loadpt_load_map_locationlogits_processorsasync_schedulingstream_intervalkv_sharing_fast_prefilloptimization_levelperformance_modekv_offloading_sizekv_offloading_backendtokens_onlyr   shutdown_timeoutweight_transfer_configfail_on_environ_validation)
flashinfertritongdn_prefill_backendc                 C   s*  t | jtrtdi | j| _t | jtrtdi | j| _t | jtr-tdi | j| _t | jtr<t	di | j| _t | j
trKtdi | j
| _
ddlm} |  tjjr| j}t| j| j| _|| jurptd|| j | jd ur| j}t| j| j| _|| jurtd|| j d S d S d S d S )Nr   rP   z@HF_HUB_OFFLINE is True, replace model_id [%s] to model_path [%s]zHHF_HUB_OFFLINE is True, replace tokenizer_id [%s] to tokenizer_path [%s]rp   )r   rl  r   r   rm  r   rn  r   r  r   r  r0   vllm.pluginsrQ   huggingface_hub	constantsHF_HUB_OFFLINEr   rW   r2  loggerinfor   r6  )selfrQ   model_idtokenizer_idrp   rp   rq   __post_init__l  sF   


	zEngineArgs.__post_init__parserrh   c           "      C   s<  t t}| jdtjd}dtjdd v rdtjdd v s)|jdi |d  |jdi |d
  |jdi |d  |jdi |d  |jdi |d  |j	di |d  |jdi |d  |jdi |d  |jdi |d  |j	di |d  |j	di |d  |jdi |d  |jdi |d   |j	!di |d"  |jdi |d$  |jdi |d'  |j	(di |d)  |jdi |d+  |j	,di |d-  |jdi |d/  |jdi |d1  |j	2di |d3  |j	4di |d5  |j	6di |d7  |j	8di |d9  |j	:di |d;  |jdi |d=  |jd>td?d@|dA dB |dA dC dD |jdi |dF  |jdi |dH  |j	Idi |dJ  |j	Kdi |dL  |j	Mdi |dN  |jdi |dP  |j	Qdi |dR  |j	Sdi |dT  |j	Udi |dV  t t}| jdWtjd}|jdi |dY  |jdi |d[  |j	\di |d]  |j	^di |d_  |jdi |da  |jdi |dc  |j	ddi |de  t t	}| jdft	jd}|j	gdi |dh  t t
}| jdit
jd}|j	jdi |dk  |j	ldi |dm  t t}	| jdntjd}
|
j	odi |	dp  |
j	q	rdi |	ds  |
jdi |	du  |
jdi |	dw  |
jdi |	dz  |
jdi |	d}  |
j	~di |	d  |
j	di |	d  |
j		di |	d  |
j	di |	d  |
j	di |	d  |
j	di |	d  |
j		di |	d  |
j	di |	d  |
jddtdd |
jddtdd |
jddtdd |
jddtdd |
jddtdd |
jddtddd |
j		di |	d  |
j		di |	d  |
j		di |	d  |
j	di |	d  |
j	di |	d  |
jdi |	d  |
j	di |	d  |
j	di |	d  |
j	di |	d  |
j	di |	d  |
j	di |	d  |
jdi |	d  |
jdi |	d  |
j	Đdi |	d  |
j	Ɛdi |	d  |
j	Ȑdi |	d  |
j	ʐdi |	d  |
jdi |	d  |
j	ΐdi |	d  t t}| jdtjd}|jdi |d  |j	Ӑdi |d  |j	Րdi |d  |jdi |d  |j	ِdi |d  |j	ېdi i |d dBdi |j	ݐdi |d  |j	ߐdi |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  t t}t t}t t}| jdtjd}|j	di |d  |jdi |d  |j	d i |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  t t}| jdtjd}|j	di |d   |j	di |d  |j	di |d  |j	di |d  |j	d	i |d  |j		d
i |d
  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  t t}| jdtjd}|jdtjdd |jdi |d   |jdi |d"  |j	#di |d$  |j	%di |d&  |jdi |d(  |j	)di |d*  |jdi |d,  |j	-di |d.  t t}| jd/tjd}|j	0di |d1  |j	2di |d3  |d4 d5 }d6d7| d8}||d4 d9< |d4 d5  d:d; tttd<d=D 7  < |j	>di |d4  |j	?di |d@  |j	Adi |dB  |j	Cd i |dD  |j	Ed!i |dF  |j	Gd"i |dH  |j	Id#i |dJ  t t}| jdKtjd}|j	Ld$i i |dM dBdi |j	Nd%i i |dO dBdi |j	Pd&i |dQ  |j	Rd'i |dS  |j	Td(i |dU  |j	Vd)i |dW  |j	Xd*i i |dY dBdi |j	Zd+i |d[  |j	\d,i |d]  |j	^d-i |d_  |j	`d.i |da  |j	bd/i |dc  t t}| jddtjd}|j	ed0i |df  |j	gd1i |dh  t t}| jditjd}|j	jd2i |dk  |dl }dmdn |do< |jd3i | t t} | jdqtjd}!ttj | dr do< |!j	sd4i | dr  |!j	td5i | du  |!jd6i | dw  |!j	xd7i | dy  |!j	{d8i | d|  |!j	~d9i | d  |!jd:i | d  |!j	d;i | d  |!j	d<i | d  |!jd=i | d  |!j	d>i | d  |!jd?i | d  |!j	d@i | d  | jdddd | jdddd | jdddtjd | jdtddd | jddddgddd | S (A  z%Shared CLI arguments for vLLM engine.r#   )titledescriptionserve   Nr   --modelr   --runnerr   	--convertr   --tokenizerr   --tokenizer-moder   --trust-remote-coder   --dtyper   --seedr   --hf-config-pathr   --allowed-local-media-pathr   --allowed-media-domainsr   
--revisionr2  --code-revisionr3  --tokenizer-revisionr6  --max-model-lenr   --quantization-qr7  --allow-deprecated-quantizationr8  --enforce-eagerr9  --enable-return-routed-expertsr   --max-logprobsr.  --logprobs-moder/  --disable-sliding-windowr   --disable-cascade-attnr!  --skip-tokenizer-initr   --enable-prompt-embedsr   --served-model-namer   --config-formatr   z
--hf-token?Tr4  r   r   )r   r   constr   r   --hf-overridesr5  --pooler-configrk  --generation-configrv  --override-generation-configrx  --enable-sleep-moderw  --model-implry  --override-attention-dtyperz  --logits-processorsr  --io-processor-pluginrH  r!   --load-formatr   --download-dirr   --safetensors-load-strategyr   --model-loader-extra-configrV  --ignore-patternsrW  --use-tqdm-on-loadr  --pt-load-map-locationr  r   --attention-backendbackendr-   --reasoning-parserr\  --reasoning-parser-pluginr]  r'   --distributed-executor-backendr   --pipeline-parallel-size-ppr   --master-addrr   --master-portr   --nnodes-nr   --node-rank-rr   --distributed-timeout-secondsr   --tensor-parallel-size-tpr   --decode-context-parallel-size-dcpr   --dcp-comm-backendr  --dcp-kv-cache-interleave-sizer  --cp-kv-cache-interleave-sizer  --prefill-context-parallel-size-pcpr   --data-parallel-size-dpr  z--data-parallel-rankz-dpnzSData parallel rank of this instance. When set, enables external load balancer mode.)r   r   z--data-parallel-start-rankz-dprz0Starting data parallel rank for secondary nodes.z--data-parallel-size-localz-dplz5Number of data parallel replicas to run on this node.z--data-parallel-addressz-dpaz+Address of data parallel cluster head-node.z--data-parallel-rpc-portz-dppz)Port for data parallel RPC communication.z--data-parallel-backendz-dpbmpz0Backend for data parallel, either "mp" or "ray".)r   r   r   --data-parallel-hybrid-lb-dphr
  --data-parallel-external-lb-dper  --enable-expert-parallel-epr  --enable-ep-weight-filterr  --all2all-backendr  --enable-dbor  --ubatch-sizer  --enable-elastic-epr  --dbo-decode-token-thresholdr  --dbo-prefill-token-thresholdr  %--disable-nccl-for-dp-synchronizationr  --enable-eplbr  --eplb-configr  --expert-placement-strategyr  --max-parallel-loading-workersr  --ray-workers-use-nsightrT  --disable-custom-all-reducer:  --worker-clsrp  --worker-extension-clsrq  r   --block-sizer  --gpu-memory-utilizationr)  --kv-cache-memory-bytesr   --kv-cache-dtypecache_dtype--num-gpu-blocks-overriderU  --enable-prefix-cachingr  --prefix-caching-hash-algor  --calculate-kv-scalesr|  --kv-sharing-fast-prefillr  --mamba-cache-dtyper}  --mamba-ssm-cache-dtyper~  --mamba-block-sizer  --mamba-cache-moder  --kv-offloading-sizer  --kv-offloading-backendr  r&   --offload-backendr"  --cpu-offload-gbr#  --cpu-offload-paramsr$  --offload-group-sizer%  --offload-num-in-groupr&  --offload-prefetch-stepr'  --offload-paramsr(  r$   --language-model-onlyr;  --limit-mm-per-promptr<  --enable-mm-embedsr>  --media-io-kwargsr@  --mm-processor-kwargsrA  --mm-processor-cache-gbrB  --mm-processor-cache-typerC  !--mm-shm-cache-max-object-size-mbrD  --mm-encoder-onlyrE  --mm-encoder-tp-moderF  --mm-encoder-attn-backendrG  --interleave-mm-stringsr?  --skip-mm-profilingrI  --video-pruning-raterJ  r"   z--enable-loraz*If True, enable handling of LoRA adapters.)r   r   --max-lorasrL  --max-lora-rankrM  --lora-dtyperQ  --enable-tower-connector-lorarR  --max-cpu-lorasrP  --fully-sharded-lorasrO  --default-mm-lorasrN  --specialize-active-lorarS  r%   !--show-hidden-metrics-for-versionr_  --otlp-traces-endpointr`  ra  r   {,}r   c                 S   s   g | ]}d  |qS )r0  )join)r   prp   rp   rq   r   q  s    
z+EngineArgs.add_cli_args.<locals>.<listcomp>   )r--collect-detailed-traces--kv-cache-metricsrb  --kv-cache-metrics-samplerc  --cudagraph-metricsrd  --enable-layerwise-nvtx-tracingre  --enable-mfu-metricsrf  "--enable-logging-iteration-detailsrg  r+   --max-num-batched-tokensr   --max-num-seqsr-  --max-num-partial-prefillsr*  --max-long-partial-prefillsr+  --long-prefill-token-thresholdr,  --scheduling-policypolicy--enable-chunked-prefillrX  --disable-chunked-mm-inputrY  --scheduler-clsrj  !--disable-hybrid-kv-cache-managerrZ  --async-schedulingr  --stream-intervalr  r   --cudagraph-capture-sizesr   --max-cudagraph-capture-sizer   r   --enable-flashinfer-autotunero  r  c                 S   s   |   ddS )N-_)lowerr   )srp   rp   rq   <lambda>  s    z)EngineArgs.add_cli_args.<locals>.<lambda>r   --moe-backendr/   r^  --speculative-config--kv-transfer-configrs  --kv-events-configrt  --ec-transfer-configru  --compilation-config-ccrl  --attention-config-acrm  --kernel-configrn  --additional-configr  --structured-outputs-configr[  --profiler-configrr  --optimization-levelr  --performance-moder  --weight-transfer-configr  z--disable-log-stats
store_truezDisable logging statistics.z--aggregate-engine-loggingzLLog aggregate rather than per-engine statistics when using data parallelism.z--fail-on-environ-validationzGIf set, the engine will raise an error if environment validation fails.F)r   r   r   z--shutdown-timeoutr   z2Shutdown timeout in seconds. 0 = abort, >0 = wait.z--gdn-prefill-backendr  r  r  zSelect GDN prefill backend.)destr   r   r   )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  r  )r  )r  )r  r  )r  r  )r  )r  r  )r  r  )r  )r  )r  )r  r  )r  r  )r  r  )r  r  )r  r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r   )r  )r  )r  )r  )r  )r  )r  )r	  )r
  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r   )r!  )r"  )r#  )r$  )r%  )r&  )r'  )r(  )r)  )r*  )r+  )r,  )r-  )r.  )r6  )r7  )r8  )r9  )r:  )r;  )r<  )r=  )r>  )r?  )r@  )rA  )rB  )rD  )rE  )rF  )rG  )rH  )rI  )rJ  )rK  )rL  )rR  )rS  )rT  )rU  )rV  )rW  rX  )rY  rZ  )r[  )r\  )r]  )r^  )r_  )r`  )ra  )!r   r#   add_argument_groupr   sysargvadd_argumentrt   r!   r   r-   r'   r   r   r&   r.   r)   r$   r"   rl   r   r%   r2  r   r   rB   r+   r   r   r/   rz   r}   r~   )"r  model_kwargsmodel_groupload_kwargs
load_groupattention_kwargsattention_groupstructured_outputs_kwargsstructured_outputs_groupparallel_kwargsparallel_groupcache_kwargscache_groupoffload_kwargs
uva_kwargsprefetch_kwargsoffload_groupmultimodal_kwargsmultimodal_grouplora_kwargs
lora_groupobservability_kwargsobservability_groupr   r   scheduler_kwargsscheduler_groupcompilation_kwargscompilation_groupkernel_kwargskernel_groupmoe_backend_kwargsvllm_kwargs
vllm_grouprp   rp   rq   add_cli_args  sb  $




zEngineArgs.add_cli_argsr   c                    s4   dd t | D }| di  fdd|D }|S )Nc                 S   s   g | ]}|j qS rp   )r   r   attrrp   rp   rq   r   ;  s    z,EngineArgs.from_cli_args.<locals>.<listcomp>c                    s"   i | ]}t  |r|t |qS rp   )hasattrgetattrr  r   rp   rq   
<dictcomp>>  s   " z,EngineArgs.from_cli_args.<locals>.<dictcomp>rp   )dataclassesr   )r   r   attrsengine_argsrp   r  rq   from_cli_args8  s
   zEngineArgs.from_cli_argsc                 C   s  t | jrd | _| _tjstd| j t	d7i d| jd| j
d| jd| jd| jd| jd	| jd
| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| j d| j!d| j"d| j#d | j$d!| j%d"| j&d#| j'd$| j(d%| j)d&| j*d'| j+d(| j,d)| j-d*| j.d+| j/d,| j0d-| j1d.| j2d/| j3d0| j4d1| j5d2| j6d3| j7d4| j8d5| j9d6| j:S )8NggufzThe global random seed is set to %d. Since VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may affect the random state of the Python process that launched vLLM.r   r   r   r   r   r   r   r   r   r   r   r   r2  r3  r4  r5  r6  r   r7  r8  r9  r   r.  r/  r   r!  r   r   r   r;  r=  r>  r?  r@  rI  r   rA  rB  rC  rD  rE  rF  rG  rk  rv  rx  rw  ry  rz  r  rJ  rH  rp   );rV   r   r7  r   envsVLLM_ENABLE_V1_MULTIPROCESSINGr  warningr   r#   r   r   r   r   r   r   r   r   r   r   r2  r3  r4  r5  r6  r   r8  r9  r   r.  r/  r   r!  r   r   r   r;  r=  r>  r?  r@  rI  r   rA  rB  rC  rD  rE  rF  rG  rk  rv  rx  rw  ry  rz  r  rJ  rH  r  rp   rp   rq   create_model_configB  s   
	
 !"#$%&'()*+,-./01234zEngineArgs.create_model_configc                 C   s:   ddl m} | jD ]}||jv r| j| | jd |< q	d S )Nr   )TensorizerConfigtensorizer_config)+vllm.model_executor.model_loader.tensorizerr  rV  _fields)r  r  keyrp   rp   rq   validate_tensorizer_args  s   

z#EngineArgs.validate_tensorizer_argsc              	   C   sx   | j dkrd| _| jdkr*t| jdr| j | _i | jd< | j| jd d< |   t| j| j| j	| j| j
| j| jdS )Nbitsandbytes
tensorizerto_serializabler  tensorizer_dir)r   r   r   rV  rW  r  r  )r7  r   r  rV  r  r   r  r!   r   r   rW  r  r  r  rp   rp   rq   create_load_config  s&   


zEngineArgs.create_load_configtarget_model_configtarget_parallel_configc                 C   s0   | j du rdS | j ||d tdi | j S )a[  Initializes and returns a SpeculativeConfig object based on
        `speculative_config`.

        This function utilizes `speculative_config` to create a
        SpeculativeConfig object. The `speculative_config` can either be
        provided as a JSON string input via CLI arguments or directly as a
        dictionary from the engine.
        Nr  r  rp   )r^  r   r,   )r  r  r  rp   rp   rq   create_speculative_config  s   
z$EngineArgs.create_speculative_configusage_contextheadlessc                  C   s	  t   tttt jd}t| j t	| j
s,t| j
| j| j| j| jd\| _
| _| _|  }|j
| _
|j| _|j| _|   | | | || d}t|jsV| }t| j|}| jduseJ dtdi d| jd| jd| jd|d	|jd
| j d|d| jd| j!d| j"d| j#d| j$d| j%d| j&d| j'd| j(d| j)}d}t* rddl+}	|	, j-}|r|. ni }
d|
v rdd |
d D |
d< t/0d|
 d}t1 rddl+}	|	j23 }|r| j4rJ d| j4r| j5rJ d| j6dks| j7dksJ dd}| j7dkrx| j8| j9 | j: }| j9| j: }|| j7 }|| j7 dks8J d | d!| j7 d"| j;| j7k sLJ d#| j; d$| j7 d"| j;| | }| j8dkrj| j5rj|| _<t/0d%| j<| j; n| j=du rxt>|| d| _=| j5p| j<du}|r| j<dusJ d&| j=d'v sJ d(d}d)| _4n`| j=dur| j=}| j?r|sd*| _4| j4r|dkrt/@d+ d*}d)| _4|| j8krd)| _4| j?p|| _<| j7dkrt/0d,| j<| j; n| j4rJ d-| j6d.krtjAd/krd}n| j8}| jBdu r+| j6d.krtC }t/0d0| |}n| j6dks#J d1| j6f| jDp)tEjF}n| jB}| jGdur7| jGntEjG}| jHrJ|jIsJd*|_It/0d2 tEdi d3| j9d4| j:d5| jJd6| j8d7| j<pcdd8|d9|d:| jDd;| jKd<| j7d=| j;d>| jLd?|d@|dA| j6dB| j4dC|jMdD| jNdE| jOdF| jPdG| jQdH| jRdI| jSdJ| jTdK| jUdL| jVdM| jWdN| jXdO| jYdP| jZdQ| j[dR| j\dS|dT|dU| j]dV| j^dW| j_dX| j`dY| jadZ| jbd[| jcd\| jdd]| je}| jf||d^}| jgdus	J d_| jhdusJ d`| jidusJ da|jjdus'J dbtkdi dc|jldd| jgde| jhdf|jjdg| jidh| jmdi|jndj|jodk| jpdl| jqdm| jrdn| jsdo| jtdp| judq| jvdr| jw}|jnsx| jxrxtyds| jzrt{| j|| j}| jx| j~| j| j| j| jr| jdkr| jnddtnd}|dur|dur|jg|jh|jd  k rtydu|jdvkrdv | _| _t| j}| jdur|jdurtydwt| j|_t| j}| jdur|jdurtydx| j|_| jdykr| j|_|  }| jr| j| j_| jr| j| j_t| j| j| j| j| j| j| j| j| j| jdz
}t| j}| jdurM|jdurItyd{| j|_| jdura|jdur]tyd|| j|_t| jt| j| jd}t| j| j| j| jd~d}| jdur| j| jd< tdi d|d|d|d|d|d|d|d|d|d|d|d| jd|d|d| jd| jd| jd| jd| jd| jd| jd| jd| j}|S )zi
        Create the VllmConfig.

        NOTE: If VllmConfig is incompatible, we raise an error.
        )device)r   r   r2  r   vllm_speculative_configNz/enable_prefix_caching must be set by this pointr  r)  r   r  is_attention_freerU  sliding_windowr  r  r|  r  r}  r~  r  r  r  r  r   env_varsc                 S   s   i | ]}|d qS )z***rp   )r   krp   rp   rq   r    s    z3EngineArgs.create_engine_config.<locals>.<dictcomp>z-Using ray runtime env (env vars redacted): %sz:data_parallel_hybrid_lb is not applicable in headless modezJdata_parallel_hybrid_lb and data_parallel_external_lb cannot both be True.r  r  z:nnodes > 1 is only supported with data_parallel_backend=mpzworld_size=z must be divisible by nnodes=rj   z
node_rank=z must be less than nnodes=z@Inferred data_parallel_rank %d from node_rank %d for external lbzYdata_parallel_rank or node_rank must be specified if data_parallel_external_lb is enable.)r  NzIdata_parallel_size_local must be 1 or None when data_parallel_rank is setFTzsdata_parallel_hybrid_lb is not eligible when data_parallel_size_local = 1, autoswitch to data_parallel_external_lb.z0Inferred data_parallel_rank %d from node_rank %dzDdata_parallel_size_local must be set to use data_parallel_hybrid_lb.rayspanz3Using host IP %s as ray-based data parallel addressz3data_parallel_backend can only be ray or mp, got %sz7Skipping tokenizer initialization for tokens-only mode.r   r   r   r  r  r  r  r   r   r   r   r   data_parallel_master_ipr	  r  r
  is_moe_modelr  r  r  r  r  r  r  r  r  r  r  r  r  r:  rT  ray_runtime_envplacement_groupr   rp  rq  r   r  r  r  r  r  r  z0max_num_batched_tokens must be set by this pointz&max_num_seqs must be set by this pointz0enable_chunked_prefill must be set by this point'max_model_len must be set by this pointrunner_typer   r-  r   rX  rY  is_multimodal_modelis_encoder_decoderrC  rj  r*  r+  r,  rZ  r  r  zJDefault modality-specific LoRA(s) were provided for a non multimodal model)rM  rL  rN  rO  rQ  rR  rS  rP  zOConsider increasing max_num_batched_tokens or decreasing num_speculative_tokensr  zEattention_backend and attention_config.backend are mutually exclusivez^enable_flashinfer_autotune and kernel_config.enable_flashinfer_autotune are mutually exclusiveauto)
r_  r`  ra  rb  rc  rd  re  rf  rh  rg  z]cudagraph_capture_sizes and compilation_config.cudagraph_capture_sizes are mutually exclusivezcmax_cudagraph_capture_size and compilation_config.max_cudagraph_capture_size are mutually exclusive)r#  r$  )r%  r&  r'  r(  )r"  uvaprefetchr  model_configcache_configparallel_configscheduler_configdevice_configload_configoffload_configrm  rn  lora_configr^  r[  observability_configrl  rs  rt  ru  rr  r  r  r  r  r  rp   )rO   pre_register_and_updater   r   r7   device_typer  validate_environr  rX   r   rU   r   r2  r   r^  r  r   _check_feature_supported4_set_default_chunked_prefill_and_prefix_caching_args1_set_default_max_num_seqs_and_batched_tokens_argsrT   hf_text_configget_sliding_windowr\   r   r  r   r  r)  r   r  rU  r  r|  r  r}  r~  r  r  r  r  rS   r  get_runtime_contextruntime_envto_dictr  r  rR   utilget_current_placement_groupr
  r  r  r   r  r   r   r   r  r  maxr  r  VLLM_RAY_DP_PACK_STRATEGYr  r[   r   r'   r  r	  r  r   r   r   r   is_moer  r  r  r  r  r  r  r  r  r  r  r  r  r:  rT  r   rp  rq  r   r  r  r  r  r  r  r   r-  rX  r   r+   r  rY  r  r  ri  rj  r*  r+  r,  rZ  r  r  rN  rk   rK  r"   rM  rL  rO  rQ  rR  rS  rP  num_speculative_tokensr7  r   r   r   rm  r{  r  r   validate_backend_beforern  ro  r  r  r\  r[  r]  r%   r_  r`  ra  rb  rc  rd  re  rf  rh  rg  rl  r   r   r&   r"  r.   r#  r$  r)   r%  r&  r'  r(  r  r  r/   rs  rt  ru  rr  r  r  r  r  ) r  r  r  r  r  r  resolved_cache_dtyper  r  r  sanitized_envr  inferred_data_parallel_rank
world_sizeworld_size_within_dplocal_world_sizer  r  host_ipr  r	  r  r^  r  r  rm  rn  r  r  rl  r  configrp   rp   rq   create_engine_config  s  




	









	
 !"#$%&'()*+.	



	
zEngineArgs.create_engine_configc                 C   sp   | j tj ks| jtjkrtdd | jdkr2t| jdd}|s4| jtjdddfvr6d	}t|d d
S d
S d
S d
S )z/Raise an error if the feature is not supported.zConcurrent Partial Prefill)feature_namer  supports_ppFr  r  external_launcherzfPipeline Parallelism without Ray distributed executor or multiprocessing executor or external launcherN)r*  r+   r+  _raise_unsupported_errorr   r  r   r'   )r  r  r   rp   rp   rq   r    s(   

z#EngineArgs._check_feature_supportedr  c                 C   s2  ddl m} zt }t  }W n ty   d}d}Y nw |dt kr;d|vr;|jd|j	di}|jd|j	di}n|jd|j	d	i}|jd
|j	d
i}t
 ryt }|dkr`|jd	|j	di}n|dkrm|jd|j	di}n|dkry|jd|j	d
i}t r|jd| |j	d	| i}|jd
| |j	d| i}||fS )Nr   ra   rv   F   a100i @  i       i      V6EV5Ei   V5Pi      )vllm.usage.usage_librb   rO   get_device_total_memoryget_device_namerO  	ExceptionrZ   	LLM_CLASSOPENAI_API_SERVERis_tpuis_cpu)r   r  rb   device_memorydevice_namedefault_max_num_batched_tokensdefault_max_num_seqs	chip_namerp   rp   rq   get_batch_defaults  sV   



zEngineArgs.get_batch_defaultsr  c                 C   s  |j }|j}| jd u r|| _td|rdnd n#|jdkr+| js+|r+tjddd n|jdkr<| jr<|s<tjd	dd | jd u rO|| _td
|rKdnd n|jdkr`| jr`|s`tjddd t	 r~t
 tjfv rtd d| _td d| _d S d S d S )Nz%s chunked prefill by defaultEnabling	DisablinggeneratezThis model does not officially support disabling chunked prefill. Disabling this manually may cause the engine to crash or produce incorrect outputs.local)scopepoolingzThis model does not officially support chunked prefill. Enabling this manually may cause the engine to crash or produce incorrect outputs.z%s prefix caching by defaultzThis model does not officially support prefix caching. Enabling this manually may cause the engine to crash or produce incorrect outputs.zMChunked prefill is not supported forRISC-V CPUs; disabling it for V1 backend.FzMPrefix caching is not supported for RISC-V CPUs; disabling it for V1 backend.)is_chunked_prefill_supportedis_prefix_caching_supportedrX  r  debugr  warning_oncer  rO   r  get_cpu_architecturerN   RISCVr  )r  r  default_chunked_prefilldefault_prefix_cachingrp   rp   rq   r    sj   






	
z?EngineArgs._set_default_chunked_prefill_and_prefix_caching_argsc                 C   s8  | j | j }| |\}}| j}| j}| jd u r ||tj| _| jd u r-||tj| _| j	dkrH|d u r=|  jd9  _|d u rH|  jd9  _|d u rx|j
d usUJ d| js`t|j
| j| _t| j|j
 | j| _td| j|ru|jnd  |d u r| jd usJ t| j| j| _td| j|r|jnd  d S d S )N
throughputr4  r  z=Defaulting max_num_batched_tokens to %d for %s usage context.z3Defaulting max_num_seqs to %d for %s usage context.)r   r   r  r   r-  r   r+   DEFAULT_MAX_NUM_BATCHED_TOKENSDEFAULT_MAX_NUM_SEQSr  r   rX  r  minr  r  value)r  r  r  r  r  r  orig_max_num_batched_tokensorig_max_num_seqsrp   rp   rq   r  W  sb   



z<EngineArgs._set_default_max_num_seqs_and_batched_tokens_args)NF(  __name__r   __qualname__r   r#   r   rt   __annotations__r   r   r   r   r   r   r   r   r>   r   r:   r   r   r   r?   r   r   r   r!   r   r   r   r`   r   r   r=   r   r  r   r2   r   r   r   r   r   rI   r   r'   r   rF   r   rc   r   r   r   r   r   r   r   r   r   r  rE   r  r  r  r  r  r  r  r	  r
  r  r  rD   r  r  r   r  r8   r  rC   r  r  r  r  r  r  r  r   r  r  rG   r  r  r  r  r  r  r6   r   r!  r&   r"  r.   r#  r   r$  r   r)   r%  r&  r'  r(  r)  r   r   r+   r*  r+  r,  r-  r.  r/  r<   r0  r1  r2  r3  r4  r5  r;   r6  r7  r_   r8  r9  r:  r$   r;  r=  r   r>  r?  r@  r   rA  rB  rC  r@   rD  rE  rF  rA   rG  r]   rH  rI  rJ  rK  r"   rL  rM  r9   rN  rO  rP  rQ  torchrR  rS  rT  rU  rV  rW  rX  rY  rZ  r/   r[  r-   r\  r]  r^  r%   r_  r`  ra  rB   rb  rc  rd  re  rf  rg  rh  rC  ri  rH   rj  objectrk  r(   rl  rm  r   rn  ro  rp  rq  rr  r*   rs  r    rt  r   ru  r   rv  rw  rx  ry  rz  r  r{  r|  r}  r5   r~  r  r  r4   r  r  r  r  r^   r  r  r  r  rJ   r  rK   r  r  r3   r  r  r  r0   r  r  r   r  staticmethodrY   r  classmethodrl   	Namespacer  r  r  r  r,   r  rb   r  r  r   r  r  r  rp   rp   rp   rq   r   i  s  
 

 


)     '	
E	   
^(SIr   c                   @   s<   e Zd ZU dZdZeed< e	d
dededefddZ	d	S )AsyncEngineArgsz'Arguments for asynchronous vLLM engine.Fenable_log_requestsr  async_args_onlyrh   c                 C   s8   t   |s
t| } | jdtjtjdd t	|  | S )Nz--enable-log-requestszEnable logging request information, dependent on log level:
- INFO: Request ID, parameters and LoRA request.
- DEBUG: Prompt inputs (e.g: text, token IDs).
You can set the minimum log level via `VLLM_LOGGING_LEVEL`.)r   r   r   )
rQ   r   r  rg  rl   r   r  r  rO   r  )r  r  rp   rp   rq   r    s   

	zAsyncEngineArgs.add_cli_argsN)F)
r  r   r  r   r  r   r  r  rY   r  rp   rp   rp   rq   r    s   
 r  r  c                 C   s   |  d|  d}t |)Nz* is not supported. We recommend to remove z from your config.)NotImplementedError)r  msgrp   rp   rq   r    s
   r  r  c                 C   s   |   } td| }|r]ddddd}ddd	d
d}| \}}||v r0|| }tt|| S ||v r]|| }zt|| W S  ty\ } ztd| d| |	  d|d}~ww t| S )zParse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    z(\d+(?:\.\d+)?)([kKmMgGtT])i  i@B i ʚ;l    J))r  mgr   r  i   i   @l        )KMGrd   z3Decimals are not allowed with binary suffixes like z. Did you mean to use z	 instead?N)
r   r{   	fullmatchgroupsr   r   rk   rl   rm   rO  )r  r|   decimal_multiplierbinary_multipliernumbersuffixmultrn   rp   rp   rq   r     sH   	r   c                 C   s(   |   } | dks|  dkrdS t| S )aA  Parse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.
    Also accepts -1 or 'auto' as a special value for auto-detection.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    - '-1' or 'auto' -> -1 (special value for auto-detection)
    z-1r  )r   rO  r   )r  rp   rp   rq   r     s   r   )rl   r   r  	functoolsr}   re  collections.abcr   r   r   r   r   	itertoolsr   r   r   typingr	   r
   r   r   r   r   r   r   r   r   r  regexr{   r  pydanticr   r   pydantic.fieldsr   typing_extensionsr   	vllm.envsr  vllm.configr   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   vllm.config.cacher2   r3   r4   r5   r6   vllm.config.devicer7   vllm.config.kernelr8   vllm.config.lorar9   vllm.config.modelr:   r;   r<   r=   r>   r?   vllm.config.multimodalr@   rA   vllm.config.observabilityrB   vllm.config.parallelrC   rD   rE   rF   rG   vllm.config.schedulerrH   vllm.config.utilsrI   vllm.config.vllmrJ   rK   vllm.loggerrL   rM   vllm.platformsrN   rO   r  rQ   vllm.ray.lazy_utilsrR   rS   vllm.transformers_utils.configrT   rU   "vllm.transformers_utils.gguf_utilsrV   "vllm.transformers_utils.repo_utilsrW   vllm.transformers_utils.utilsrX   vllm.utils.argparse_utilsrY   vllm.utils.mem_constantsrZ   vllm.utils.network_utilsr[   vllm.utils.torch_utilsr\   #vllm.v1.attention.backends.registryr]   vllm.v1.sample.logits_processorr^   'vllm.model_executor.layers.quantizationr_    vllm.model_executor.model_loaderr`   r  rb   vllm.v1.executorrc   r  r  rd   r   r  re   r  rf   rt   ru   rz   r   r   r   r   r   r   r   r   r   r   r   r   rf  argv0endswithr   	lru_cacher   r   r   r  r  r   r   r   rp   rp   rp   rq   <module>   s   
0t &*"	"
$"e              @.