[Binary artifact: compiled CPython 3.10 bytecode cache (serve.cpython-310.pyc) of
transformers/cli/serve.py, loaded from a virtualenv path ending in
.../lib/python3.10/site-packages/transformers/cli/serve.py. The bytecode itself is not
representable as text; only the embedded docstrings, identifiers, and CLI help strings are
recoverable, and they are summarized below.]

Module help text (recovered verbatim from the file):

    Run a FastAPI server to serve models on-demand with an OpenAI compatible API.

    Models will be loaded and unloaded automatically based on usage and a timeout.

    The server will expose the following endpoints:
        - POST /v1/chat/completions: Generates chat completions.
        - POST /v1/responses: Generates responses.
        - POST /v1/audio/transcriptions: Generates transcriptions from audio.
        - GET /v1/models: Lists available models for 3rd party tools.

    Requires FastAPI and Uvicorn to be installed.

Recoverable structure:

    - TransformersResponseCreateParamsStreaming: OpenAI's ResponseCreateParamsStreaming with
      an additional generation_config field (a JSON string).
    - TransformersCompletionCreateParamsStreaming: OpenAI's CompletionCreateParamsStreaming
      with additional fields for the generation config (a JSON string) and the request_id.
    - TransformersTranscriptionCreateParams: OpenAI's TranscriptionCreateParamsBase with
      additional file, generation_config, and stream fields.
    - create_generation_config_from_req(req, model_generation_config, **kwargs): builds a
      GenerationConfig for a request, using the request's generation_config (if provided) or
      the model's default config as the baseline, then applying the remaining request
      parameters (max_output_tokens/max_tokens, frequency_penalty, logit_bias, stop,
      temperature, top_p, seed) on top.
    - DownloadAggregator, DownloadProxy, WeightsProxy, set_tqdm_class: tqdm wrappers that
      aggregate file-download and weight-loading progress bars into server-sent "loading"
      events with "download" and "weights" stages.
    - ToolState: lightweight tracker for the streaming tool-call state.
    - TimedModel: holds a PreTrainedModel and its processor and removes them from memory
      after a configurable period of inactivity (model_timeout).
    - Serve: the FastAPI application. It registers the endpoints listed in the module help
      text plus a /health healthcheck, a /load_model SSE endpoint for pre-loading models,
      and an x-request-id middleware. Recoverable CLI options include continuous_batching,
      device, dtype, trust_remote_code, attn_implementation, quantization (bnb-4bit or
      bnb-8bit), host and port (defaults appear to be localhost:8000), model_timeout
      (apparently 300 seconds), log_level, default_seed, enable_cors, input_validation,
      force_model, and non_blocking. Chat completions can be served either via continuous
      batching or via regular generate with a streamed response.
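The following is a minimal client sketch for the OpenAI-compatible API described in the
recovered help text above. It is an illustration, not part of the original file: it assumes
the server is already running locally (e.g. via the `transformers serve` CLI command this
module appears to implement) on the default localhost:8000, that the `openai` Python package
is installed, and the model id used is only a hypothetical example.

```python
# Client-side sketch against the OpenAI-compatible endpoints listed above.
# Assumptions: server reachable at http://localhost:8000, any non-empty api_key accepted.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

# POST /v1/chat/completions, streamed: print deltas as they arrive.
stream = client.chat.completions.create(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # example model id; the server loads it on demand
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()

# GET /v1/models: list the generative models the server found in the local cache.
for model in client.models.list():
    print(model.id)
```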