# Reconstructed source for ``vllm/entrypoints/utils.py``. The original text was a
# byte-compiled (.pyc) dump; the names, docstrings, and string constants below come
# from that dump, while the statement-level layout is a best-effort reconstruction
# and may differ in detail from the original module.

import asyncio
import dataclasses
import functools
import os
from argparse import Namespace
from http import HTTPStatus
from logging import Logger
from string import Template

import regex as re
from fastapi import Request
from fastapi.responses import JSONResponse, StreamingResponse
from starlette.background import BackgroundTask, BackgroundTasks

from vllm import envs
from vllm.engine.arg_utils import EngineArgs
from vllm.entrypoints.openai.engine.protocol import (ErrorInfo, ErrorResponse,
                                                     GenerationError,
                                                     StreamOptions)
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
from vllm.logger import current_formatter_type, init_logger
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser

logger = init_logger(__name__)

VLLM_SUBCMD_PARSER_EPILOG = """\
For full list:            vllm {subcmd} --help=all
For a section:            vllm {subcmd} --help=ModelConfig    (case-insensitive)
For a flag:               vllm {subcmd} --help=max-model-len  (_ or - accepted)
Documentation:            https://docs.vllm.ai
"""


async def listen_for_disconnect(request: Request) -> None:
    """Returns if a disconnect message is received"""
    while True:
        message = await request.receive()
        if message["type"] == "http.disconnect":
            # When server load tracking is enabled, a request that goes away
            # mid-flight must still decrement the in-flight counter.
            if getattr(request.app.state, "enable_server_load_tracking",
                       False) and hasattr(request.app.state,
                                          "server_load_metrics"):
                request.app.state.server_load_metrics -= 1
            return


def with_cancellation(handler_func):
    """Decorator that allows a route handler to be cancelled by client
    disconnections.

    This does _not_ use request.is_disconnected, which does not work with
    middleware. Instead this follows the pattern from
    starlette.StreamingResponse, which simultaneously awaits on two tasks: one
    to wait for an http disconnect message, and the other to do the work that we
    want done. When the first task finishes, the other is cancelled.

    A core assumption of this method is that the body of the request has already
    been read. This is a safe assumption to make for fastapi handlers that have
    already parsed the body of the request into a pydantic model for us.
    This decorator is unsafe to use elsewhere, as it will consume and throw away
    all incoming messages for the request while it looks for a disconnect
    message.

    In the case where a `StreamingResponse` is returned by the handler, this
    wrapper will stop listening for disconnects and instead the response object
    will start listening for disconnects.
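
    Example (illustrative only; ``router`` and ``CompletionRequest`` are
    hypothetical names used for this sketch, not part of this module). The
    wrapped handler must receive the ``Request`` either as its second
    positional argument or as a ``raw_request`` keyword argument::

        @router.post("/v1/completions")
        @with_cancellation
        async def create_completion(request: CompletionRequest,
                                    raw_request: Request):
            ...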
    """

    @functools.wraps(handler_func)
    async def wrapper(*args, **kwargs):
        request = args[1] if len(args) > 1 else kwargs["raw_request"]

        handler_task = asyncio.create_task(handler_func(*args, **kwargs))
        cancellation_task = asyncio.create_task(listen_for_disconnect(request))

        done, pending = await asyncio.wait([handler_task, cancellation_task],
                                           return_when=asyncio.FIRST_COMPLETED)
        for task in pending:
            task.cancel()

        if handler_task in done:
            return handler_task.result()
        return None

    return wrapper


def decrement_server_load(request: Request):
    request.app.state.server_load_metrics -= 1


def load_aware_call(func):

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        raw_request = kwargs.get("raw_request",
                                 args[1] if len(args) > 1 else None)
        if raw_request is None:
            raise ValueError(
                "raw_request required when server load tracking is enabled")

        if not getattr(raw_request.app.state, "enable_server_load_tracking",
                       False):
            return await func(*args, **kwargs)

        if not hasattr(raw_request.app.state, "server_load_metrics"):
            raw_request.app.state.server_load_metrics = 0
        raw_request.app.state.server_load_metrics += 1

        try:
            response = await func(*args, **kwargs)
        except Exception:
            raw_request.app.state.server_load_metrics -= 1
            raise

        if isinstance(response, (JSONResponse, StreamingResponse)):
            if response.background is None:
                response.background = BackgroundTask(decrement_server_load,
                                                     raw_request)
            elif isinstance(response.background, BackgroundTasks):
                response.background.add_task(decrement_server_load, raw_request)
            elif isinstance(response.background, BackgroundTask):
                # Chain the existing single task with the decrement so both run.
                tasks = BackgroundTasks()
                tasks.add_task(response.background.func,
                               *response.background.args,
                               **response.background.kwargs)
                tasks.add_task(decrement_server_load, raw_request)
                response.background = tasks
        else:
            # Non-streaming, non-JSON results finish here; decrement right away.
            raw_request.app.state.server_load_metrics -= 1

        return response

    return wrapper


def cli_env_setup():
    if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ:
        logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


def get_max_tokens(max_model_len: int,
                   max_tokens: int | None,
                   input_length: int,
                   default_sampling_params: dict,
                   override_max_tokens: int | None = None) -> int:
    if max_model_len < input_length:
        raise ValueError(f"Input length ({input_length}) exceeds model's "
                         f"maximum context length ({max_model_len}).")

    model_max_tokens = max_model_len - input_length
    platform_max_tokens = current_platform.get_max_output_tokens(input_length)
    fallback_max_tokens = (override_max_tokens
                           if override_max_tokens is not None else
                           default_sampling_params.get("max_tokens"))
    return min(val
               for val in (max_tokens, model_max_tokens, platform_max_tokens,
                           fallback_max_tokens) if val is not None)


def log_non_default_args(args: Namespace | EngineArgs):
    from vllm.entrypoints.openai.cli_args import make_arg_parser

    non_default_args = {}
    if isinstance(args, Namespace):
        parser = make_arg_parser(FlexibleArgumentParser())
        for arg, default in vars(parser.parse_args([])).items():
            if default != getattr(args, arg):
                non_default_args[arg] = getattr(args, arg)
    elif isinstance(args, EngineArgs):
        default_args = EngineArgs(model=args.model)
        for field in dataclasses.fields(args):
            current_val = getattr(args, field.name)
            default_val = getattr(default_args, field.name)
            if current_val != default_val:
                non_default_args[field.name] = current_val
        # default_args above is built with the same model, so a non-default
        # model has to be flagged explicitly.
        if args.model != EngineArgs.model:
            non_default_args["model"] = args.model
    else:
        raise TypeError("Unsupported argument type. "
                        "Must be Namespace or EngineArgs instance.")
    logger.info("non-default args: %s", non_default_args)


def should_include_usage(stream_options: "StreamOptions | None",
                         enable_force_include_usage: bool) -> tuple[bool, bool]:
    if stream_options:
        include_usage = (stream_options.include_usage
                         or enable_force_include_usage)
        include_continuous_usage = include_usage and bool(
            stream_options.continuous_usage_stats)
    else:
        include_usage, include_continuous_usage = (enable_force_include_usage,
                                                   False)
    return include_usage, include_continuous_usage


def process_lora_modules(
        args_lora_modules: list[LoRAModulePath],
        default_mm_loras: dict[str, str] | None) -> list[LoRAModulePath]:
    # The compiled module does this import inside the function body (likely to
    # avoid a circular import); the imported name is reconstructed here.
    from vllm.entrypoints.openai.models.serving import LoRAModulePath

    lora_modules = args_lora_modules
    if default_mm_loras:
        default_mm_lora_paths = [
            LoRAModulePath(name=modality, path=lora_path)
            for modality, lora_path in default_mm_loras.items()
        ]
        if lora_modules is None:
            lora_modules = default_mm_lora_paths
        else:
            lora_modules += default_mm_lora_paths
    return lora_modules


def sanitize_message(message: str) -> str:
    # Strip memory addresses (e.g. "<object at 0x7f...>") from error text.
    return re.sub(r" at 0x[0-9a-f]+>", ">", message)


def log_version_and_model(lgr: Logger, version: str, model_name: str) -> None:
    if (envs.VLLM_DISABLE_LOG_LOGO
            or (formatter := current_formatter_type()) is None):
        message = "vLLM server version %s, serving model %s"
    else:
        logo_template = Template("""
       ${w}█     █     █▄   ▄█${r}
 ${o}▄▄${r} ${b}▄█${r} ${w}█     █     █ ▀▄▀ █${r}  version ${w}%s${r}
  ${o}█${r}${b}▄█▀${r} ${w}█     █     █     █${r}  model   ${w}%s${r}
   ${b}▀▀${r}  ${w}▀▀▀▀▀ ▀▀▀▀▀ ▀     ▀${r}
""")
        colors = {
            "w": "\033[97;1m",
            "o": "\033[93m",
            "b": "\033[94m",
            "r": "\033[0m",
        }
        if formatter != "color":
            colors = dict.fromkeys(colors, "")
        message = logo_template.substitute(colors)
    lgr.info(message, version, model_name)


def create_error_response(
        message: str | Exception,
        err_type: str = "BadRequestError",
        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
        param: str | None = None) -> JSONResponse:
    exc = None
    if isinstance(message, Exception):
        exc = message
        from vllm.exceptions import VLLMNotFoundError, VLLMValidationError

        # The exception-to-status mapping below is reconstructed from the
        # compiled constants; individual branch details may differ slightly.
        if isinstance(exc, VLLMValidationError):
            err_type = "BadRequestError"
            status_code = HTTPStatus.BAD_REQUEST
            param = exc.parameter
        elif isinstance(exc, VLLMNotFoundError):
            err_type = "NotFoundError"
            status_code = HTTPStatus.NOT_FOUND
            param = None
        elif isinstance(exc, (ValueError, TypeError, OverflowError)):
            err_type = "BadRequestError"
            status_code = HTTPStatus.BAD_REQUEST
            param = None
        elif isinstance(exc, NotImplementedError):
            err_type = "NotImplementedError"
            status_code = HTTPStatus.NOT_IMPLEMENTED
            param = None
        elif isinstance(exc, GenerationError):
            err_type = "InternalServerError"
            status_code = exc.status_code
            param = None
        elif any(cls.__name__ == "TemplateError" for cls in type(exc).__mro__):
            # Treat Jinja template errors as bad requests without importing
            # jinja2 here.
            err_type = "BadRequestError"
            status_code = HTTPStatus.BAD_REQUEST
            param = None
        else:
            err_type = "InternalServerError"
            status_code = HTTPStatus.INTERNAL_SERVER_ERROR
            param = None
        message = sanitize_message(str(exc))

    return JSONResponse(
        content=ErrorResponse(error=ErrorInfo(message=message,
                                              type=err_type,
                                              code=status_code.value,
                                              param=param)).model_dump(),
        status_code=status_code.value)