o
    :/i                     @   s  d dl mZmZ d dlmZmZmZ d dlmZ d dl	m
Z
mZ d dlmZmZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' e
rxd dl$m(Z( eG dd dZ)G dd deZ*dS )    )ABCabstractmethod)AsyncGeneratorIterableMapping)	dataclass)TYPE_CHECKINGAny)ModelConfig
VllmConfig)WeightTransferInitRequestWeightTransferUpdateRequest)ProcessorInputs
PromptType)LoRARequest)PoolingRequestOutputRequestOutput)IOProcessor)PoolingParams)BaseRenderer)SamplingParams)SupportedTask)EngineCoreRequest)InputProcessor)	PauseModec                   @   s*   e Zd ZU dZeed< dZedB ed< dS )StreamingInputzInput data for a streaming generation request.

    This is used with generate() to support multi-turn streaming sessions
    where inputs are provided via an async generator.
    promptNsampling_params)__name__
__module____qualname____doc__r   __annotations__r   r    r#   r#   a/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/engine/protocol.pyr      s   
 r   c                   @   sx  e Zd ZU dZeed< eed< eed< edB ed< e	ed< e
edefd	d
Ze
edefddZe
edefddZe
edefddZedddddddddeeB eB eedf B dedededB dedB deeef dB deeef dB dededB dedB deedf fddZe					dfdeeB de dededB deeef dB dedeeef dB dedB dee!df fd d!Z"edee#e B ddfd"d#Z$edefd$d%Z%edgd&d'Z&edgd(d)Z'edgd*d+Z(edgd,d-Z)edgd.d/Z*edgd0d1Z+e	2dhd3ed4edefd5d6Z,edid9ed:d;ddfd<d=Z-edjd>e.e dB ddfd?d@Z/edefdAdBZ0ededefdCdDZ1ed8d2dEdFd:d;dGedHeddfdIdJZ2edgdKdLZ3edefdMdNZ4edjdOe5dB ddfdPdQZ6	RdkdSedTeddfdUdVZ7		W	dldXedOe5dB dYe8dZedB fd[d\Z9de8e:d]f fd^d_Z;d`e<ddfdadbZ=dce>ddfdddeZ?dS )mEngineClientz$Protocol class for Clients to Enginevllm_configmodel_configrendererNio_processorinput_processorreturnc                 C      d S Nr#   selfr#   r#   r$   
is_running2      zEngineClient.is_runningc                 C   r,   r-   r#   r.   r#   r#   r$   
is_stopped6   r1   zEngineClient.is_stoppedc                 C   r,   r-   r#   r.   r#   r#   r$   errored:   r1   zEngineClient.erroredc                 C   r,   r-   r#   r.   r#   r#   r$   
dead_error>   r1   zEngineClient.dead_errorr   )prompt_textlora_requesttokenization_kwargstrace_headersprioritydata_parallel_rankreasoning_endedr   r   
request_idr5   r6   r7   r8   r9   r:   r;   c                C      dS )zGenerate outputs for a request.Nr#   )r/   r   r   r<   r5   r6   r7   r8   r9   r:   r;   r#   r#   r$   generateB   s   zEngineClient.generatepooling_paramsc	           	      C   r=   )z4Generate outputs for a request from a pooling model.Nr#   )	r/   r   r?   r<   r6   r8   r9   r7   r;   r#   r#   r$   encodeW   s   zEngineClient.encodec                       dS )zAbort a request.

        Args:
            request_id: The unique id of the request,
                        or an iterable of such ids.
        Nr#   )r/   r<   r#   r#   r$   abortf      zEngineClient.abortc                       d S r-   r#   r.   r#   r#   r$   is_tracing_enabledp      zEngineClient.is_tracing_enabledc                    rD   r-   r#   r.   r#   r#   r$   do_log_statss   rF   zEngineClient.do_log_statsc                    rA   )zRaise if unhealthyNr#   r.   r#   r#   r$   check_healthv      zEngineClient.check_healthc                    rA   )zStart profiling the engineNr#   r.   r#   r#   r$   start_profile{   rI   zEngineClient.start_profilec                    rA   )zStop profiling the engineNr#   r.   r#   r#   r$   stop_profile   rI   zEngineClient.stop_profilec                    rA   )zReset the multi-modal cacheNr#   r.   r#   r#   r$   reset_mm_cache   rI   zEngineClient.reset_mm_cachec                    rA   )zReset the encoder cacheNr#   r.   r#   r#   r$   reset_encoder_cache   rI   z EngineClient.reset_encoder_cacheFreset_running_requestsreset_connectorc                    rA   )zDReset the prefix cache and optionally any configured connector cacheNr#   )r/   rN   rO   r#   r#   r$   reset_prefix_cache   s   zEngineClient.reset_prefix_cache   rB   levelmoder   c                    rA   )zSleep the engineNr#   )r/   rR   rS   r#   r#   r$   sleep   rI   zEngineClient.sleeptagsc                    rA   )zWake up the engineNr#   )r/   rU   r#   r#   r$   wake_up   rI   zEngineClient.wake_upc                    rA   )z$Check whether the engine is sleepingNr#   r.   r#   r#   r$   is_sleeping   rI   zEngineClient.is_sleepingc                    rA   )z<Load a new LoRA adapter into the engine for future requests.Nr#   )r/   r6   r#   r#   r$   add_lora   rI   zEngineClient.add_loraT)rS   wait_for_inflight_requestsclear_cacherY   rZ   c                   rA   )a~  Pause new generation/encoding requests.

        Args:
            mode: How to handle in-flight requests:
                - ``"abort"``: Abort all in-flight requests immediately
                  and return partial results with "abort" reason (default).
                - ``"wait"``: Wait for in-flight requests to complete.
                - ``"keep"``: Freeze requests in queue; they resume on
                  :meth:`resume_generation`.
            wait_for_inflight_requests: DEPRECATED. Use ``mode="wait"`` instead.
            clear_cache: DEPRECATED. Whether to clear KV and prefix caches
                after draining.
        Nr#   )r/   rS   rY   rZ   r#   r#   r$   pause_generation   s   zEngineClient.pause_generationc                    rA   )z.Resume accepting generation/encoding requests.Nr#   r.   r#   r#   r$   resume_generation   rI   zEngineClient.resume_generationc                    rA   )z.Return whether the engine is currently paused.Nr#   r.   r#   r#   r$   	is_paused   rI   zEngineClient.is_pausedtimeoutc                 C   r=   )z*Shutdown the engine with optional timeout.Nr#   )r/   r^   r#   r#   r$   shutdown   s   zEngineClient.shutdown,  new_data_parallel_sizedrain_timeoutc                       t )zScale the engineNotImplementedError)r/   ra   rb   r#   r#   r$   scale_elastic_ep      zEngineClient.scale_elastic_epr#   methodargskwargsc                    rc   )z0Perform a collective RPC call to the given path.rd   )r/   rh   r^   ri   rj   r#   r#   r$   collective_rpc   rC   zEngineClient.collective_rpc.c                    rc   )zGet supported tasksrd   r.   r#   r#   r$   get_supported_tasks      z EngineClient.get_supported_tasksinit_requestc                    rc   )z+Initialize weight transfer for RL training.rd   )r/   rn   r#   r#   r$   init_weight_transfer_engine   rg   z(EngineClient.init_weight_transfer_enginerequestc                    rc   )z&Batched weight update for RL training.rd   )r/   rp   r#   r#   r$   update_weights   rm   zEngineClient.update_weights)NNr   NN)r+   N)FF)rQ   rB   r-   )r`   )Nr#   N)@r   r   r    r!   r   r"   r
   r   r   r   propertyr   boolr0   r2   r3   BaseExceptionr4   r   r   r   r   r   r   strr   dictr	   r   intr   r>   r   r   r@   r   rB   rE   rG   rH   rJ   rK   rL   rM   rP   rT   listrV   rW   rX   r[   r\   r]   floatr_   rf   tuplerk   r   rl   r   ro   r   rq   r#   r#   r#   r$   r%   )   s@  
 

	

	

	
	


r%   N)+abcr   r   collections.abcr   r   r   dataclassesr   typingr   r	   vllm.configr
   r   %vllm.distributed.weight_transfer.baser   r   vllm.inputs.datar   r   vllm.lora.requestr   vllm.outputsr   r   vllm.plugins.io_processorsr   vllm.pooling_paramsr   vllm.renderersr   vllm.sampling_paramsr   
vllm.tasksr   vllm.v1.enginer   vllm.v1.engine.input_processorr   r   r   r%   r#   r#   r#   r$   <module>   s*   