
    #j                         d Z ddlZddlZddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ  ej        e          Z G d d	          Zd
e_         dS )z+
CLI entry point for `transformers serve`.
    N)	Annotated)logging)is_serve_available   )set_torch_seedc            .       |   e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d8deedz   ej        d	
          f         dee ej        d
          f         dee	dz   ej        d
          f         dee	dz   ej        d
          f         dee	dz   ej        d
          f         dee
dz   ej        d
          f         deedz   ej        d
          f         deedz   ej        d
          f         dee ej        d
          f         deedz   ej        d
          f         dee ej        d
          f         deedz   ej        d 
          f         d!ee ej        d"
          f         d#ee	 ej        d$
          f         d%ee ej        d&
          f         d'ee	 ej        d(
          f         d)ee ej        d*
          f         d+ee ej        d,
          f         d-ee	dz   ej        d.
          f         d/ee ej        d0d12          f         d3df*d4Zd5 Zd6 Zd7 ZdS )9ServeNFauto,  	localhost@  warningforce_modelz*Model to preload and use for all requests.)helpcontinuous_batchingzMEnable continuous batching with paged attention. Configure with --cb-* flags.cb_block_sizez6KV cache block size in tokens for continuous batching.cb_num_blocksz2Number of KV cache blocks for continuous batching.cb_max_batch_tokensz1Maximum tokens per batch for continuous batching.cb_max_memory_percentz/Max GPU memory fraction for KV cache (0.0-1.0).cb_use_cuda_graphz+Enable CUDA graphs for continuous batching.attn_implementationz2Attention implementation (e.g. flash_attention_2).compilez*Enable torch.compile for faster inference.quantizationz.Quantization method: 'bnb-4bit' or 'bnb-8bit'.devicez4Device for inference (e.g. 'auto', 'cuda:0', 'cpu').dtypez2Override model dtype. 'auto' derives from weights.trust_remote_codezTrust remote code when loading.model_timeoutzGSeconds before idle model is unloaded. Ignored when force_model is set.hostzServer listen address.portzServer listen port.enable_corszEnable permissive CORS.	log_levelz'Logging level (e.g. 'info', 'warning').default_seedzDefault torch seed.non_blockingTz1Run server in a background thread. Used by tests.)hiddenr   returnc           	         t                      st          d          dd l}ddlm} ddlm} ddlm} ddl	m
} ddlm} dd	lm} dd
lm} |t#          |           t%          j        d          }|                    t$          j        |                                                     ||||||
||          | _        ddlm} d |||||d                                D             }|r |di |nd }  |||	|           | _         || j        | j                  | _         || j        | j                  | _         || j        | j                  | _         || j        | j                  | _         || j        | j        | j        | j        | j        |          }!|                     |!||d          }"|!                    |"          | _"        |r| #                                 d S | j"        $                                 d S )NzRMissing dependencies for serving. Install with `pip install transformers[serving]`r   r   )ChatCompletionHandler)CompletionHandler)ModelManager)ResponseHandler)build_server)TranscriptionHandler)GenerationStatetransformers)r   r   r   r   r   r   r   )ContinuousBatchingConfigc                     i | ]
\  }}|||S N ).0kvs      `/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/transformers/cli/serve.py
<dictcomp>z"Serve.__init__.<locals>.<dictcomp>p   s/     

 

 

1 } q }}    )
block_size
num_blocksmax_batch_tokensmax_memory_percentuse_cuda_graph)r   r   	cb_config)model_managergeneration_state)completion_handlerresponse_handlertranscription_handlerr    info)r   r   r!   r2   )%r   ImportErroruvicornserving.chat_completionr'   serving.completionr(   serving.model_managerr)   serving.responser*   serving.serverr+   serving.transcriptionr,   serving.utilsr-   r   r   
get_loggersetLevel
log_levelslower_model_managerr.   r/   items_generation_state_chat_handler_completion_handler_response_handler_transcription_handlerConfigServerserverstart_serverrun)#selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   rF   r'   r(   r)   r*   r+   r,   r-   transformers_loggerr/   	cb_kwargsr>   appconfigs#                                      r6   __init__zServe.__init__"   s   \ "## 	trsssBBBBBB999999777777555555000000??????222222 #<((( &0@@$$W%7	8I8I%JKKK*l/ 3%'#
 
 
 	:99999

 

 ,+$7&;"3  egg

 

 

	 >GP,,99y999D	!0 3"
 "
 "
 32-!3
 
 

 $5#4-!3$
 $
 $
 
 "1-!3"
 "
 "

 ';&:4;NPTPf&g&g#l#7!3"&"=#
 
 
 $TVLLnnV,, 	KOOr8   c                 |      fd}t          j        |dd           _         j                                         d S )Nc                      t          j                    } t          j        |            |                     j                                                   d S r1   )asyncionew_event_loopset_event_looprun_until_completer[   serve)loopr^   s    r6   _runz Serve.start_server.<locals>._run   sJ    )++D"4(((##DK$5$5$7$788888r8   zuvicorn-threadF)targetnamedaemon)	threadingThread_threadstart)r^   rl   s   ` r6   r\   zServe.start_server   sS    	9 	9 	9 	9 	9
 !'t:JSXYYYr8   c                 8    | j                                          dS )z$Clear all loaded models from memory.N)rR   shutdownr^   s    r6   reset_loaded_modelszServe.reset_loaded_models   s    $$&&&&&r8   c                     | j                                          | j                                         | j        r| j                                        sd S d| j        _        | j                            d           d S )NT   )timeout)rT   ru   rR   rr   is_aliver[   should_exitjoinrv   s    r6   kill_serverzServe.kill_server   sy    '')))$$&&&| 	4<#8#8#:#: 	F"&!$$$$$r8   )NFNNNNNNFNr
   r
   Fr   r   r   Fr   NF)__name__
__module____qualname__r   strtyperArgumentboolOptionintfloatrc   r\   rw   r~   r2   r8   r6   r	   r	   !   s         qu
       di lrpvch LWIMUZbkX\ Y@ @sTz>5>?k+l+l+llm@ 'ELmnnnp
	@ !$J*bcccc
@ !$J*^____
@ '$J*]^^^^
@"  )DL,%,,]^^^^ 
#@( %4K+XYYYY
)@. '$J*^____
/@4 43_!`!`!``a5@6  $J*Z[[[[
7@< #|u|1ghhhhi=@> t\U\7k%l%l%llm?@@ %T<5<=^+_+_+_%_`A@B !#lmmmm
C@J \U\/GHHHHIK@L \U\/DEEEEFM@N t\U\7P%Q%Q%QQRO@P S,%,4]"^"^"^^_Q@R  d
LEL>S,T,T,T TUS@T  ,%,d1deeee
U@Z 
[@ @ @ @D  ' ' '% % % % %r8   r	   u  
Run a FastAPI server to serve models on-demand with an OpenAI compatible API.
Models will be loaded and unloaded automatically based on usage and a timeout.


Endpoints:
    POST /v1/chat/completions — Chat completions (streaming + non-streaming).
    POST /v1/completions      — Legacy text completions from a prompt.
    GET  /v1/models           — Lists available models.
    GET  /health              — Health check.

Requires FastAPI and Uvicorn: pip install transformers[serving]
)__doc__rf   rp   typingr   r   transformers.utilsr   transformers.utils.import_utilsr   rM   r   rN   r   loggerr	   r2   r8   r6   <module>r      s                 & & & & & & > > > > > > ) ) ) ) ) ) 
	H	%	%V% V% V% V% V% V% V% V%rr8   