o
    :/i3                     @   sV  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( ee)Z*dZ+G dd deZ,de-e fddZ.de j/fddZ0de j/fddZ1	d	dddZ2dS )    N)CLISubcommand)
run_serverrun_server_workersetup_server)make_arg_parservalidate_parsed_serve_args)VLLM_SUBCMD_PARSER_EPILOG)init_logger)UsageContext)FlexibleArgumentParser)get_tcp_uri)decorate_logsset_process_title)CoreEngineProcManagerlaunch_core_engines)Executor)MultiprocExecutor)setup_multiprocess_prometheus)APIServerProcessManagerwait_for_completion_or_failurea0  Launch a local OpenAI-compatible API server to serve LLM
completions via HTTP. Defaults to Qwen/Qwen3-0.6B if no model is specified.

Search by using: `--help=<ConfigGroup>` to explore options by section (e.g.,
--help=ModelConfig, --help=Frontend)
  Use `--help=all` to show all available flags at once.
c                   @   sT   e Zd ZdZdZedejddfddZdejddfdd	Z	d
ej
defddZdS )ServeSubcommandz(The `serve` subcommand for the vLLM CLI.serveargsreturnNc                 C   sL  t | dr| jd ur| j| _t| ddr#ddlm} t||  d S | jr<| j	d ur9| j	dkr9t
d| j	 dd| _	| jpC| jd u}| jpK| jd u}|rT|rTt
d| j	d u r|r_d	| _	n%|rt| jped	| _	| j	d	krstd
| j	 n| j| _	| j	d	krtd| j	 | j	d	k rt|  d S | j	d	krt|  d S d | _	tt|  d S )N	model_taggrpcFr   )
serve_grpcz--api-server-count=zN cannot be used with --headless (no API servers are started in headless mode).a  Cannot use both external and hybrid data parallel load balancing modes. External LB is enabled via --data-parallel-external-lb or --data-parallel-rank. Hybrid LB is enabled via --data-parallel-hybrid-lb or --data-parallel-start-rank. Use one mode or the other.   zPDefaulting api_server_count to data_parallel_size_local (%d) for hybrid LB mode.z7Defaulting api_server_count to data_parallel_size (%d).)hasattrr   modelgetattrvllm.entrypoints.grpc_serverr   uvlooprunheadlessapi_server_count
ValueErrordata_parallel_external_lbdata_parallel_rankdata_parallel_hybrid_lbdata_parallel_start_rankdata_parallel_size_localloggerinfodata_parallel_sizerun_headlessrun_multi_api_serverr   )r   r   is_external_lbis_hybrid_lb r3   g/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/entrypoints/cli/serve.pycmd/   sV   




zServeSubcommand.cmdc                 C   s   t | d S N)r   )selfr   r3   r3   r4   validatex   s   zServeSubcommand.validate
subparsersc                 C   sB   |j | jdtdd}t|}|jddddd tj| jd	|_|S )
NzNLaunch a local OpenAI-compatible API server to serve LLM completions via HTTP.z vllm serve [model_tag] [options])helpdescriptionusagez--grpc
store_trueFzdLaunch a gRPC server instead of the HTTP OpenAI-compatible server. Requires: pip install vllm[grpc].)actiondefaultr:   )subcmd)
add_parsernameDESCRIPTIONr   add_argumentr   formatepilog)r7   r9   serve_parserr3   r3   r4   subparser_init{   s   zServeSubcommand.subparser_init)__name__
__module____qualname____doc__rB   staticmethodargparse	Namespacer5   r8   _SubParsersActionr   rH   r3   r3   r3   r4   r   *   s    Hr   r   c                   C   s   t  gS r6   )r   r3   r3   r3   r4   cmd_init   s   rQ   r   c              
      s  | j dkr	tdtj| }tj}|j|dd}|jr td|j	}|j
}|dkr.tdd  fd	d
}ttj| ttj| |jdkroddlm} |j}| d|j }	td||	 t|dd}
|
jdd d S |j}|j}t||}td|| t||j	jd|d|t||j d}z|  W d } r|j }td| |j!|d td d S d } r|j }td| |j!|d td w )Nr   z.api_server_count can't be set in headless modeT)usage_contextr$   z:data_parallel_hybrid_lb is not applicable in headless moder   z5data_parallel_size_local must be > 0 in headless modeFc                       t d|   sd td S NzReceived %d signal.Tr,   debug
SystemExitsignumframeshutdown_requestedr3   r4   signal_handler   
   z$run_headless.<locals>.signal_handler)__version__:zpLaunching vLLM (v%s) headless multiproc executor, with head node address %s for torch.distributed process group.)monitor_workers)inlinezQLaunching %d data parallel engine(s) in headless mode, with head node address %s.)local_engine_countstart_indexlocal_start_indexvllm_configlocal_clienthandshake_addressexecutor_class	log_stats.Waiting up to %d seconds for processes to exittimeoutzShutting down.)"r%   r&   vllmAsyncEngineArgsfrom_cli_argsr
   OPENAI_API_SERVERcreate_engine_configr)   parallel_configr+   signalSIGTERMSIGINTnode_rank_within_dpvllm.versionr_   master_addrmaster_portr,   r-   r   start_worker_monitordata_parallel_master_ipdata_parallel_rpc_portr   r   r(   r   	get_classdisable_log_stats
join_firstshutdown_timeoutshutdown)r   engine_argsrR   rf   rs   rc   r]   VLLM_VERSIONhosthead_node_addressexecutorportrh   engine_managerrm   r3   r[   r4   r/      sz   



r/   c                    s  | j rJ | j}|dksJ |dkrt  d  fdd}ttj| ttj| t| \}}tj	| }||_
d|_tj}|j|d}|dkrQtjrQtdt|}|j }	|j}
|
j}|
jsi|dksiJ d }dd	lm} |||}t|||	||-\}}}tt||| ||j|j|r| nd d
}|dks|
jst di |}W d    n1 sw   Y  |d u r|j!|d< t di |}zIt"|||d W d  }} r|j#}t$% | }t&'d| dt(d B dt(d B fdd}|j)|d |r|j)||d |r|j)||d d S d S d  }} r#|j#}t$% | }t&'d| dt(d B dt(d B fdd}|j)|d |rA|j)||d |rM|j)||d w w )Nr   r   Fc                    rS   rT   rU   rX   r[   r3   r4   r]      r^   z,run_multi_api_server.<locals>.signal_handler)rR   zIVLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used with api_server_count > 1)get_engine_zmq_addresses)target_server_fnlisten_addresssockr   num_serversinput_addressesoutput_addressesstats_update_addressr   )api_server_managerr   coordinatorrk   deadliner   c                 S   s   | d u r| S t | t  dS )Ng        )maxtime	monotonic)r   r3   r3   r4   
to_timeoutI  s   
z(run_multi_api_server.<locals>.to_timeoutrl   r3   )*r$   r%   r   rt   ru   rv   r   rn   ro   rp   _api_process_count_api_process_rankr
   rq   rr   envs VLLM_ALLOW_RUNTIME_LORA_UPDATINGr&   r   r~   r   rs   r(   local_engines_onlyvllm.v1.engine.utilsr   r   dictrun_api_server_worker_procinputsoutputsget_stats_publish_addressr   frontend_stats_publish_addressr   r   r   r   r,   r-   floatr   )r   num_api_serversr]   r   r   r   rR   rf   ri   rj   rs   dp_rankr   r   	addresseslocal_engine_managerr   api_server_manager_kwargsrm   shutdown_byr   r3   r[   r4   r0      s   




r0   c                 K   sH   |pi }| dd}tdt| t  tt| |||fi | dS )z6Entrypoint for individual API server worker processes.client_indexr   	APIServerN)getr   strr   r"   r#   r   )r   r   r   client_configuvicorn_kwargsserver_indexr3   r3   r4   r   U  s   r   r6   )r   N)3rN   rt   r   r"   rn   	vllm.envsr   vllm.entrypoints.cli.typesr   "vllm.entrypoints.openai.api_serverr   r   r    vllm.entrypoints.openai.cli_argsr   r   vllm.entrypoints.utilsr   vllm.loggerr	   vllm.usage.usage_libr
   vllm.utils.argparse_utilsr   vllm.utils.network_utilsr   vllm.utils.system_utilsr   r   r   r   r   vllm.v1.executorr   #vllm.v1.executor.multiproc_executorr   vllm.v1.metrics.prometheusr   vllm.v1.utilsr   r   rI   r,   rC   r   listrQ   rO   r/   r0   r   r3   r3   r3   r4   <module>   s<   	hTl