o
    :/i*                     @   s  U d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
mZmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZmZmZmZmZmZ d d
lm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d dlm(Z) d dlm*Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z2 d dl3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlDmEZEmFZFmGZG d dlHmIZImJZJmKZKmLZLmMZMmNZN d dlOmPZPmQZQ d dlRmSZS d dlTmUZU d dlVmWZW d dlXmYZY erd dlZZZd dl[Z[neYde\ dZ[eYde\ dZZeAe]Z^d e_fd!d"Z`G d#d$ d$eaZbd%d&d'd(ZcG d)d* d*e;d+d,ZdG d-d. d.e;d+d,ZeG d/d0 d0e;d+d,ZfG d1d2 d2e;d+d,ZgG d3d4 d4e;d+d,ZhG d5d6 d6e;d+d,ZiG d7d8 d8e6ZjG d9d: d:e;d+d,ZkG d;d< d<e;d+d,ZlG d=d> d>e;d+d,ZmG d?d@ d@e;d+d,ZnG dAdB dBe;d+d,Zoe)eeB e"B eiB e#B ekB elB efB egB emB enB e_B eoB Z(eepdC< G dDdE dEe;d+d,Zqe+eqB e2B Z*eepdF< G dGdH dHe;d+d,ZredI ZsedJ ZtedK ZuedLZvG dMdN dNeKZwdOexeZjy dPeSfdQdRZzdSexe{e_dTf  dPeSfdUdVZ|dWe_dSexe dPeSfdXdYZ}G dZd[ d[eeev Z~d\exeee_dB f  dPeSd]exe_ fd^d_Zd`e{e_exeee_dB f  f dPeSdae{e_exe_ f dbeeFeGf fdcddZG dedf dfe~eee_dB f  ZG dgdh dhe~e
eee_dB f   ZG didj djeZG dkdl dleZG dmdn dneZeG dodp dpZdqee_B dB fdrdsZd+dtdqee_B dB duedbe_dB fdvdwZeeZd+dtdqee_B dB duedbe_dB fdxdyZdze{e_exf d{exe_ dbe_fd|d}Zdze{e_exf d{exe_ d~edbe_fddZeee$ZeeefZeeegZeee"Zeee#ZeeekZeeeoZe8e!jZe8eejZe8eijZe8e/jZe_e{e_e_f B e-B ejB Zeepd< dd dd dd dd dd dd dd dd dd dd dd dd dd dZe{e_ee(gef f epd< de(dbee_ef fddZdZddde_dee( de~ded~ede{e_ef dB dbexer fddZde(deded~edbedB f
ddZeee Zeee'Z	dde*de~detd~ede{e_ef dB dbexer fddZdexer dbdfddZ		ddexe* de?detde{e_e{e_ef f dB de{e_ef dB dbeexer eFdB eGdB f fddZ		ddexe* de?detde{e_e{e_ef f dB de{e_ef dB dbeexer eFdB eGdB f fddZdexer fddZdde_fddZdS )    N)ABCabstractmethod)Counterdefaultdict)	AwaitableCallableIterable)	dataclass)cached_property	lru_cachepartial)
accumulate)Path)TYPE_CHECKINGAnyGenericLiteral	TypeAliasTypeVarcast)#ChatCompletionAssistantMessageParam#ChatCompletionContentPartImageParam(ChatCompletionContentPartInputAudioParam%ChatCompletionContentPartRefusalParam"ChatCompletionContentPartTextParamChatCompletionFunctionToolParam"ChatCompletionMessageToolCallParamChatCompletionToolMessageParam)ChatCompletionContentPartParam)ChatCompletionMessageParam)
InputAudio)ResponseInputImageParam)Message)Image)	BaseModel
ConfigDictTypeAdapter)Required	TypedDict)envs)ModelConfig)init_logger)SupportsMultiModal)MULTIMODAL_REGISTRYMultiModalDataDictMultiModalUUIDDict)MultiModalBatchedFieldMultiModalFlatFieldMultiModalSharedFieldVisionChunkVisionChunkImageVisionChunkVideo)MEDIA_CONNECTOR_REGISTRYMediaConnector)BaseMultiModalProcessorrandom_uuid)
is_list_of)
LazyLoadertransformerstorchnamec                 C   s<   | dkrddl m} tjdtdd |S tdtd| )	Nresolve_hf_chat_templater   )resolve_chat_templatez`vllm.entrypoints.chat_utils.resolve_hf_chat_template` has been moved to `vllm.renderers.hf.resolve_chat_template`. The old name will be removed in v0.16.   )
stacklevelzmodule z has no attribute )vllm.renderers.hfrA   warningswarnDeprecationWarningAttributeError__name__)r?   rA    rJ   h/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/entrypoints/chat_utils.py__getattr__F   s   rL   c                   @   s   e Zd ZdZdS )ChatTemplateResolutionErrorzRaised when chat template resolution fails.

    This is a subclass of ValueError for backward compatibility with
    existing exception handlers.
    N)rI   
__module____qualname____doc__rJ   rJ   rJ   rK   rM   W   s    rM   z<##IMAGE##>z<##AUDIO##>z<##VIDEO##>)imageaudiovideoc                   @      e Zd ZU ee ed< dS )AudioURLurlNrI   rN   rO   r'   str__annotations__rJ   rJ   rJ   rK   rU   f      
 rU   F)totalc                   @   *   e Zd ZU ee ed< eed  ed< dS )#ChatCompletionContentPartAudioParam	audio_urltypeN)rI   rN   rO   r'   rU   rY   r   rJ   rJ   rJ   rK   r]   m      
 r]   c                   @   F   e Zd ZU eeeef B dB ed< 	 eed  ed< 	 edB ed< dS ))ChatCompletionContentPartImageEmbedsParamNimage_embedsr_   uuidrI   rN   rO   rX   dictrY   r'   r   rJ   rJ   rJ   rK   rb   t      
 rb   c                   @   ra   ))ChatCompletionContentPartAudioEmbedsParamNaudio_embedsr_   rd   re   rJ   rJ   rJ   rK   rh      rg   rh   c                   @   rT   )VideoURLrV   NrW   rJ   rJ   rJ   rK   rj      rZ   rj   c                   @   r\   )#ChatCompletionContentPartVideoParam	video_urlr_   N)rI   rN   rO   r'   rj   rY   r   rJ   rJ   rJ   rK   rk      r`   rk   c                   @   s&   e Zd ZU dZejed< eddZdS )PILImagez#
    A PIL.Image.Image object.
    	image_pilT)arbitrary_types_allowedN)rI   rN   rO   rP   r#   rY   r%   model_configrJ   rJ   rJ   rK   rm      s   
 
rm   c                   @   s*   e Zd ZU dZedB ed< edB ed< dS )(CustomChatCompletionContentPILImageParamzA simpler version of the param that only accepts a PIL image.

    Example:
    {
        "image_pil": ImageAsset('cherry_blossom').pil_image
    }
    Nrn   rd   )rI   rN   rO   rP   rm   rY   rX   rJ   rJ   rJ   rK   rq      
   
 rq   c                   @   *   e Zd ZU dZedB ed< edB ed< dS )+CustomChatCompletionContentSimpleImageParamzA simpler version of the param that only accepts a plain image_url.
    This is supported by OpenAI API, although it is not documented.

    Example:
    {
        "image_url": "https://example.com/image.jpg"
    }
    N	image_urlrd   rI   rN   rO   rP   rX   rY   rJ   rJ   rJ   rK   rt      s
   
 	rt   c                   @   s   e Zd ZU dZedB ed< dS )+CustomChatCompletionContentSimpleAudioParamzA simpler version of the param that only accepts a plain audio_url.

    Example:
    {
        "audio_url": "https://example.com/audio.mp3"
    }
    Nr^   rv   rJ   rJ   rJ   rK   rw      s   
 rw   c                   @   rs   )+CustomChatCompletionContentSimpleVideoParamzA simpler version of the param that only accepts a plain audio_url.

    Example:
    {
        "video_url": "https://example.com/video.mp4"
    }
    Nrl   rd   rv   rJ   rJ   rJ   rK   rx      rr   rx   c                   @   s:   e Zd ZU dZee ed< 	 eed< 	 eed  ed< dS )!CustomThinkCompletionContentParamzA Think Completion Content Param that accepts a plain text and a boolean.

    Example:
    {
        "thinking": "I am thinking about the answer",
        "closed": True,
        "type": "thinking"
    }
    thinkingclosedr_   N)	rI   rN   rO   rP   r'   rX   rY   boolr   rJ   rJ   rJ   rK   ry      s   
 
ry   r   c                   @   sz   e Zd ZU dZee ed< 	 eee B ed< 	 eed< 	 edB ed< 	 e	e
 dB ed< 	 edB ed< 	 ee dB ed	< dS )
 CustomChatCompletionMessageParamz0Enables custom roles in the Chat Completion API.rolecontentr?   Ntool_call_id
tool_calls	reasoningtools)rI   rN   rO   rP   r'   rX   rY   listr   r   r   r   rJ   rJ   rJ   rK   r}     s    
 r}   r   c                   @   s   e Zd ZU ee ed< 	 edB eeeef  B ed< 	 edB ed< 	 edB ed< 	 ee	 dB ed< 	 edB ed< 	 edB ed< 	 ee
 dB ed	< dS )
ConversationMessager~   Nr   r   r?   r   r   reasoning_contentr   )rI   rN   rO   r'   rX   rY   r   rf   r   r   r   rJ   rJ   rJ   rK   r   6  s"   
 r   )autostringopenai)r   r   )rQ   rR   rS   rc   ri   vision_chunk_Tc                   @   s   e Zd ZdS )_BatchedSingleItemFieldN)rI   rN   rO   rJ   rJ   rJ   rK   r   ^  s    r   tensorsmm_processorc                    s   | d }|j jj }t| dkr.|jdkr.|jd dkr.|jd |kr.td t	ddS |j t
 fdd| D r?t S d	d
 | D }dgt|fdd
tt|D }t|dS )Nr         zBatched multi-modal embedding inputs are deprecated for Chat API. Please pass a separate content part for each multi-modal item.)
batch_sizec                 3   s    | ]}|j  kV  qd S N)shape).0t)first_shaperJ   rK   	<genexpr>v  s    z _detect_field.<locals>.<genexpr>c                 S   s   g | ]}t |qS rJ   )len)r   tensorrJ   rJ   rK   
<listcomp>y      z!_detect_field.<locals>.<listcomp>c                    s$   g | ]}t  |  |d   fqS )r   )slice)r   i)
slice_idxsrJ   rK   r   {  s    )slices)infoctxrp   get_inputs_embeds_sizer   ndimr   loggerwarningr   allr0   r   ranger1   )r   r   
first_itemhidden_sizesize_per_itemr   rJ   )r   r   rK   _detect_fieldb  s&   




r   
data_itemsztorch.Tensorc                    s    si S t  d  tfdd dd  D rtd fddD  fdd D }z6t|i fd	dD fd
dD }|D ] jfdd D dd|< qSW |S  t	yy   t
d Y |S w )Nr   c                 3   s     | ]}t |  kV  qd S r   )setkeysr   item)
first_keysrJ   rK   r     s    z _merge_embeds.<locals>.<genexpr>r   zCAll dictionaries in the list of embeddings must have the same keys.c                    s&   i | ]  t  fd dD qS )c                       g | ]}|  qS rJ   rJ   r   keyrJ   rK   r     r   ,_merge_embeds.<locals>.<dictcomp>.<listcomp>)r   )r   )r   r   r   rK   
<dictcomp>  s    z!_merge_embeds.<locals>.<dictcomp>c                    s.   i | ]\ } |j  fd dD ddqS )c                    r   rJ   rJ   r   r   rJ   rK   r     r   r   F
pin_memory)_reduce_data)r   field)r   r   rK   r     s    c                    s   i | ]}| | j qS rJ   )r   r   r   )parsed_configsrJ   rK   r         c                    s.   g | ]} | | krt  | ts|qS rJ   )
isinstancer   r   )fieldsparsed_fieldsrJ   rK   r     s    z!_merge_embeds.<locals>.<listcomp>c                    r   rJ   rJ   r   r   rJ   rK   r     r   Fr   zKError when parsing merged embeddings. Falling back to auto-detected fields.)r   r   any
ValueErroritems_get_mm_fields_configr=   BatchFeaturer   	Exceptionr   	exception)r   r   data_mergedkeys_to_updaterJ   )r   r   r   r   r   r   r   rK   _merge_embeds  sD   
	
r   modalityc                    s|   t |dkr|S tdd |D r|S t|tjr.|  d  fdd|D }t||  S t|tr8t||S tt|)Nr   c                 s   s    | ]}|d u V  qd S r   rJ   r   rJ   rJ   rK   r         z#_get_embeds_data.<locals>.<genexpr>_embedsc                    s   g | ]} |iqS rJ   rJ   r   
embeds_keyrJ   rK   r     r   z$_get_embeds_data.<locals>.<listcomp>)	r   r   r;   r>   Tensorr   rf   NotImplementedErrorr_   )r   r   r   
dict_itemsrJ   r   rK   _get_embeds_data  s   


r   c                	       s  e Zd ZdZ	d dedeeeeef f dB f fddZe	de
fdd	Zedefd
dZe	dee fddZedeeeeef f dB fddZedd Zedd Zedd Ze	dd ZdedededB fddZe	d deeef dB ddfddZ  ZS )!BaseMultiModalItemTrackerz
    Tracks multi-modal items in a given request and ensures that the number
    of multi-modal items in a given request does not exceed the configured
    maximum per prompt.
    Nrp   media_io_kwargsc                    sF   t    || _|| _tttt f t| _tttt f t| _	d S r   )
super__init___model_config_media_io_kwargsr   rX   r   r   _items_by_modality_modality_order)selfrp   r   	__class__rJ   rK   r     s
   
z"BaseMultiModalItemTracker.__init__returnc                 C   s   t | jjddS )zDCheck if model uses unified vision_chunk modality for images/videos.use_unified_vision_chunkF)getattrr   	hf_configr   rJ   rJ   rK   !use_unified_vision_chunk_modality  s   z;BaseMultiModalItemTracker.use_unified_vision_chunk_modalityc                 C   s   | j S r   )r   r   rJ   rJ   rK   rp     s   z&BaseMultiModalItemTracker.model_configc                 C   s$   ddl m} || j}ttt |S )Nr   )get_model_cls) vllm.model_executor.model_loaderr   rp   r   r_   r,   )r   r   	model_clsrJ   rJ   rK   r     s   
z#BaseMultiModalItemTracker.model_clsc                 C   s   | j p| jjr| jjjS d S r   )r   r   multimodal_configr   r   rJ   rJ   rK   r     s   
z)BaseMultiModalItemTracker.media_io_kwargsc                 C      | j jS r   )r   allowed_local_media_pathr   rJ   rJ   rK   r        z2BaseMultiModalItemTracker.allowed_local_media_pathc                 C   r   r   )r   allowed_media_domainsr   rJ   rJ   rK   r     r   z/BaseMultiModalItemTracker.allowed_media_domainsc                 C   s   t S r   )r-   r   rJ   rJ   rK   mm_registry     z%BaseMultiModalItemTracker.mm_registryc                 C   s   | j | jS r   )r   create_processorrp   r   rJ   rJ   rK   r     s   z&BaseMultiModalItemTracker.mm_processorr   r   c                 C   s   | dd}|}| jo|dv }|rd}t| j| d }n	t| j| d }| jj}|dur>|jr>||dkr>|dr>n| j	j
|| |rY| j| | | jd | n| j| | | j||S )z
        Add a multi-modal item to the current prompt and returns the
        placeholder string to use, if any.

        An optional uuid can be added which serves as a unique identifier of the
        media.
        r    )rS   rQ   r   r   Nr   )replacer   r   r   rp   r   enable_mm_embedsget_limit_per_promptendswithr   r   validate_num_itemsappendr   r   get_placeholder_str)r   r   r   input_modalityoriginal_modalityuse_vision_chunk	num_items	mm_configrJ   rJ   rK   add  s.   zBaseMultiModalItemTracker.addmm_processor_kwargsBaseMultiModalContentParserc                 C      t r   r   r   r   rJ   rJ   rK   create_parser6     z'BaseMultiModalItemTracker.create_parserr   )rI   rN   rO   rP   r*   rf   rX   r   r   r
   r|   r   propertyrp   r_   r,   r   r   r   r   r   r   ModalityStrr   r   r   r  __classcell__rJ   rJ   r   rK   r     s@    	$



/r   vision_chunk_itemsvision_chunks_modality_orderc                 C   s~  dd | D }t | t |ksJ dt |  dt | dg }d}t|| D ]\}\}}|dkrJt|drD|j}	|td|	|d	 q'|| q'|d
krt|dr|d urz@|p\t }
t|trmt |dkrm|d }n|}|	|}t
|D ]\}}|td|d |
 d| ||d d qx|d7 }W q' ty } ztd| || W Y d }~q'd }~ww || q'||fS )Nc                 S      g | ]\}}|qS rJ   rJ   r   datard   rJ   rJ   rK   r   D  r   z/_resolve_vision_chunk_items.<locals>.<listcomp>zvision_chunk items (z) and modality_order (z) must have same lengthr   rQ   media)r_   rQ   rd   rS   split_video_chunksr   video_chunk-prompt)r_   r  rd   	video_idxr  z Failed to split video chunks: %s)r   ziphasattrr  r   r4   r:   r   tupler  	enumerater5   r   r   r   )r	  r   r
  vision_chunks_uuidsprocessed_chunksr  inner_modalityr  rd   
image_data
video_uuid
video_datavideo_chunksr   vcerJ   rJ   rK   _resolve_vision_chunk_items=  sZ   



	
r!  items_by_modalitymodality_orderr   c                 C   s  d| v rd| v rt dd| v rd| v rt di }i }d| v r:tddd | d D ||d< d	d | d D |d< d| v rTd
d | d D |d< dd | d D |d< d| v rrtddd | d D ||d< dd | d D |d< d| v rdd | d D |d< dd | d D |d< d| v rdd | d D |d< dd | d D |d< d| v rt| d ||dg \}}||d< ||d< ||fS )NrQ   rc   z4Mixing raw image and embedding inputs is not allowedrR   ri   z4Mixing raw audio and embedding inputs is not allowedc                 S      g | ]\}}|qS rJ   rJ   r  rJ   rJ   rK   r     r   z"_resolve_items.<locals>.<listcomp>c                 S   r  rJ   rJ   r  rJ   rJ   rK   r     r   c                 S   r$  rJ   rJ   r  rJ   rJ   rK   r     r   c                 S   r  rJ   rJ   r  rJ   rJ   rK   r     r   c                 S   r$  rJ   rJ   r  rJ   rJ   rK   r     r   c                 S   r  rJ   rJ   r  rJ   rJ   rK   r     r   c                 S   r$  rJ   rJ   r  rJ   rJ   rK   r     r   c                 S   r  rJ   rJ   r  rJ   rJ   rK   r     r   rS   c                 S   r$  rJ   rJ   r  rJ   rJ   rK   r     r   c                 S   r  rJ   rJ   r  rJ   rJ   rK   r     r   r   )r   r   r!  get)r"  r   r#  mm_datamm_uuidsr  vision_chunk_uuidsrJ   rJ   rK   _resolve_items{  sL   
r)  c                   @   L   e Zd ZdeedB edB f fddZ	d	deee	f dB ddfddZ
dS )
MultiModalItemTrackerr   Nc                 C   s    | j sdS tt| j | j| jS )NNN)r   r)  rf   r   r   r   rJ   rJ   rK   resolve_items  s
   z#MultiModalItemTracker.resolve_itemsr   r   c                 C      t | |dS Nr   )MultiModalContentParserr  rJ   rJ   rK   r    s   z#MultiModalItemTracker.create_parserr   rI   rN   rO   r  r.   r/   r-  rf   rX   r   r  rJ   rJ   rJ   rK   r+    s    
r+  c                   @   r*  )
AsyncMultiModalItemTrackerr   Nc                    s6   | j sdS dd | j  D I d H }t|| j| jS )Nr,  c                    s$   i | ]\}}|t j| I d H qS r   )asynciogather)r   r   corosrJ   rJ   rK   r     s
    z<AsyncMultiModalItemTracker.resolve_items.<locals>.<dictcomp>)r   r   r)  r   r   )r   resolved_items_by_modalityrJ   rJ   rK   r-    s   
z(AsyncMultiModalItemTracker.resolve_itemsr   r   c                 C   r.  r/  )AsyncMultiModalContentParserr  rJ   rJ   rK   r    s   z(AsyncMultiModalItemTracker.create_parserr   r2  rJ   rJ   rJ   rK   r3    s    
r3  c                       sh  e Zd Zd! fddZdededB fddZdeeef fd	d
Z	e
d"dedB dedB ddfddZe
	d"deeeef B dB dedB ddfddZe
	d"dejdB dedB ddfddZe
d"dedB dedB ddfddZe
	d"dedB dedB ddfddZe
	d"deeeef B dB dedB ddfddZe
d"dedB dedB ddfdd Z  ZS )#r   r   Nc                    s   t    tt| _d S r   )r   r   r   r   _placeholder_storager   r   rJ   rK   r     s   
z$BaseMultiModalContentParser.__init__r   placeholderc                 C   s$   t | }|r| j| | d S d S r   )MODALITY_PLACEHOLDERS_MAPr9  r   )r   r   r:  mod_placeholderrJ   rJ   rK   _add_placeholder  s   z,BaseMultiModalContentParser._add_placeholderc                 C   s
   t | jS r   )rf   r9  r   rJ   rJ   rK   mm_placeholder_storage  s   
z2BaseMultiModalContentParser.mm_placeholder_storageru   rd   c                 C   r  r   r  )r   ru   rd   rJ   rJ   rK   parse_image  r   z'BaseMultiModalContentParser.parse_imagerc   c                 C   r  r   r  )r   rc   rd   rJ   rJ   rK   parse_image_embeds     z.BaseMultiModalContentParser.parse_image_embedsrn   c                 C   r  r   r  )r   rn   rd   rJ   rJ   rK   parse_image_pil  r  z+BaseMultiModalContentParser.parse_image_pilr^   c                 C   r  r   r  )r   r^   rd   rJ   rJ   rK   parse_audio  r   z'BaseMultiModalContentParser.parse_audioinput_audioc                 C   r  r   r  )r   rD  rd   rJ   rJ   rK   parse_input_audio  r  z-BaseMultiModalContentParser.parse_input_audiori   c                 C   r  r   r  )r   ri   rd   rJ   rJ   rK   parse_audio_embeds  rA  z.BaseMultiModalContentParser.parse_audio_embedsrl   c                 C   r  r   r  )r   rl   rd   rJ   rJ   rK   parse_video  r   z'BaseMultiModalContentParser.parse_video)r   Nr   )rI   rN   rO   r   r  rX   r=  rf   r   r>  r   r?  r@  r#   rB  rC  r    rE  rF  rG  r  rJ   rJ   r   rK   r     s\    ""*r   c                       sN  e Zd Z	ddedeeef dB ddf fddZede	fddZ
dd	edB d
edB ddfddZ	ddeeeef B dB d
edB ddfddZ	ddeeeef B dB d
edB ddfddZ	ddejdB d
edB ddfddZddedB d
edB ddfddZ	ddedB d
edB ddfddZddedB d
edB ddfddZ  ZS ) r1  Ntrackerr   r   c                    6   t    || _tjtj|j|j|j	d| _
|| _d S N)r   r   r   r   r   _trackerr6   loadr)   VLLM_MEDIA_CONNECTORr   r   r   
_connector_mm_processor_kwargsr   rH  r   r   rJ   rK   r     s   

z MultiModalContentParser.__init__c                 C   r   r   rL  rp   r   rJ   rJ   rK   rp   '  r   z$MultiModalContentParser.model_configru   rd   c                 C   6   |r| j |nd }| jd||f}| d| d S NrQ   )rO  fetch_imagerL  r   r=  )r   ru   rd   rQ   r:  rJ   rJ   rK   r?  +     z#MultiModalContentParser.parse_imagerc   c                    s    j  }|jstdt|tr% fdd| D } jd||f}t|t	r9 j
|} jd||f}|d u rF jdd |f} d| d S )N9You must set `--enable-mm-embeds` to input `image_embeds`c                       i | ]\}}| j |qS rJ   rO  fetch_image_embeddingr   kvr   rJ   rK   r   =      z>MultiModalContentParser.parse_image_embeds.<locals>.<dictcomp>rc   rQ   )rp   get_multimodal_configr   r   r   rf   r   rL  r   rX   rO  rZ  r=  )r   rc   rd   r   embedsr:  	embeddingrJ   r   rK   r@  1  s    



z*MultiModalContentParser.parse_image_embedsri   c                    s    j  }|jstdt|tr& fdd| D } jd||f}nt|t	r; j
|} jd||f}n	 jdd |f} d| d S )N9You must set `--enable-mm-embeds` to input `audio_embeds`c                    rX  rJ   rO  fetch_audio_embeddingr[  r   rJ   rK   r   X  r^  z>MultiModalContentParser.parse_audio_embeds.<locals>.<dictcomp>ri   rR   )rp   r_  r   r   r   rf   r   rL  r   rX   rO  rd  r=  )r   ri   rd   r   r`  r:  ra  rJ   r   rK   rF  L  s   



z*MultiModalContentParser.parse_audio_embedsrn   c                 C   s"   | j d||f}| d| d S rT  )rL  r   r=  )r   rn   rd   r:  rJ   rJ   rK   rB  e  s   z'MultiModalContentParser.parse_image_pilr^   c                 C   rS  NrR   )rO  fetch_audiorL  r   r=  )r   r^   rd   rR   r:  rJ   rJ   rK   rC  k  rV  z#MultiModalContentParser.parse_audiorD  c                 C   H   |r| dd}| dd}|rd| d| }nd }nd }| ||S Nr  r   formatzdata:audio/z;base64,r%  rC  r   rD  rd   
audio_dataaudio_formatr^   rJ   rJ   rK   rE  q     z)MultiModalContentParser.parse_input_audiorl   c                 C   s   |r	| j j|dnd }| jd||f}| d| |rA| jrC| jddrE|r.| j |nd }| jd||f}| d| d S d S d S d S )N)rl   rS   use_audio_in_videoFrR   )rO  fetch_videorL  r   r=  rP  r%  rf  )r   rl   rd   rS   r:  rR   audio_placeholderrJ   rJ   rK   rG    s   z#MultiModalContentParser.parse_videor   )rI   rN   rO   r+  rf   rX   r   r   r  r*   rp   r?  r@  rF  r#   rB  rC  r    rE  rG  r  rJ   rJ   r   rK   r1    s\     	


 
(r1  c                       s  e Zd Z	d%dedeeef dB ddf fddZede	fddZ
d	edB d
edB fddZd%d	edB d
edB ddfddZ	d%deeeef B dB d
edB ddfddZ	d%deeeef B dB d
edB ddfddZ	d%dejdB d
edB ddfddZdedB d
edB fddZd%dedB d
edB ddfddZ	d%dedB d
edB ddfddZd edB d
edB fd!d"Zd%d edB d
edB ddfd#d$Z  ZS )&r8  NrH  r   r   c                    rI  rJ  rK  rQ  r   rJ   rK   r     s   

z%AsyncMultiModalContentParser.__init__c                 C   r   r   rR  r   rJ   rJ   rK   rp     r   z)AsyncMultiModalContentParser.model_configru   rd   c                    $   |r| j |I d H nd }||fS r   )rO  fetch_image_async)r   ru   rd   rQ   rJ   rJ   rK   _image_with_uuid_async     z3AsyncMultiModalContentParser._image_with_uuid_asyncc                 C   *   |  ||}| jd|}| d| d S rT  )rt  rL  r   r=  )r   ru   rd   coror:  rJ   rJ   rK   r?       z(AsyncMultiModalContentParser.parse_imagerc   c                        j  }|jstdtjttjt	t
tjf B d B t
d B f   }t|t	r9 fdd| D }|||f t|t
rK j|}|||f |d u rV|d |f  jd|} d| d S )NrW  c                    rX  rJ   rY  r[  r   rJ   rK   r     r^  zCAsyncMultiModalContentParser.parse_image_embeds.<locals>.<dictcomp>rc   rQ   )rp   r_  r   r   r4  Futurer  r>   r   rf   rX   r   r   
set_resultrO  rZ  rL  r   r=  )r   rc   rd   r   futurer`  ra  r:  rJ   r   rK   r@    (   
"


z/AsyncMultiModalContentParser.parse_image_embedsri   c                    ry  )Nrb  c                    rX  rJ   rc  r[  r   rJ   rK   r     r^  zCAsyncMultiModalContentParser.parse_audio_embeds.<locals>.<dictcomp>ri   rR   )rp   r_  r   r   r4  rz  r  r>   r   rf   rX   r   r   r{  rO  rd  rL  r   r=  )r   ri   rd   r   r|  r`  ra  r:  rJ   r   rK   rF    r}  z/AsyncMultiModalContentParser.parse_audio_embedsrn   c                 C   s^   t jttjd B td B f   }|r|||f n|d |f | jd|}| d| d S rT  )	r4  rz  r  r#   rX   r{  rL  r   r=  )r   rn   rd   r|  r:  rJ   rJ   rK   rB    s   z,AsyncMultiModalContentParser.parse_image_pilr^   c                    rr  r   )rO  fetch_audio_async)r   r^   rd   rR   rJ   rJ   rK   _audio_with_uuid_async  ru  z3AsyncMultiModalContentParser._audio_with_uuid_asyncc                 C   rv  re  )r  rL  r   r=  )r   r^   rd   rw  r:  rJ   rJ   rK   rC    rx  z(AsyncMultiModalContentParser.parse_audiorD  c                 C   rg  rh  rj  rk  rJ   rJ   rK   rE    rn  z.AsyncMultiModalContentParser.parse_input_audiorl   c                    rr  r   )rO  fetch_video_async)r   rl   rd   rS   rJ   rJ   rK   _video_with_uuid_async  ru  z3AsyncMultiModalContentParser._video_with_uuid_asyncc                 C   st   |  ||}| jd|}| d| |r4| jr6| jddr8| ||}| jd|}| d| d S d S d S d S )NrS   ro  FrR   )r  rL  r   r=  rP  r%  r  )r   rl   rd   rw  r:  
audio_cororq  rJ   rJ   rK   rG  #  s   z(AsyncMultiModalContentParser.parse_videor   )rI   rN   rO   r3  rf   rX   r   r   r  r*   rp   rt  r?  r@  rF  r#   rB  r  rC  r    rE  r  rG  r  rJ   rJ   r   rK   r8    sb     	
#
#
 
(r8  c                   @   s6   e Zd ZU dZedB ed< dZeed< dZe	ed< dS )ChatTemplateConfigNchat_templater   chat_template_content_formatFtrust_request_chat_template)
rI   rN   rO   r  rX   rY   r  ChatTemplateContentFormatOptionr  r|   rJ   rJ   rJ   rK   r  4  s   
 r  r  c                    s    du rdS t  tr  stdt  trLd}t fdd|D sFt  sHddlm} |  }| sJtd  d	  d
| dS dS dS t	t
  d)z5Raises if the provided chat template appears invalid.Nz-the supplied chat template path doesn't exist{}
c                 3       | ]}| v V  qd S r   rJ   r   cr  rJ   rK   r   F  r   z)validate_chat_template.<locals>.<genexpr>r   CHAT_TEMPLATES_DIRz#The supplied chat template string (z/) appears path-like, but doesn't exist! Tried:  and z" is not a valid chat template type)r   r   existsFileNotFoundErrorrX   r   /vllm.transformers_utils.chat_templates.registryr  r   	TypeErrorr_   )r  JINJA_CHARSr  builtin_template_pathrJ   r  rK   validate_chat_template;  s.   


r  
is_literalr  c                   sB   d u rd S |rt  trtd S zt }| W  d    W S 1 s(w   Y  W d S  ty } zet  tr= d}t fdd|D sddlm} |  }z!t|}| W  d    W W  Y d }~S 1 sow   Y  W n ty   d  d  d	| d
| }t	||w t
 ddW  Y d }~S d }~ww )Nz<chat_template is expected to be read directly from its valuer  c                 3   r  r   rJ   r  r  rJ   rK   r   r  r   z&_load_chat_template.<locals>.<genexpr>r   r  zThe supplied chat template (z=) looks like a file path, but it failed to be opened. Tried: r  z
. Reason: Tr  )r   r   r  openreadOSErrorr   r  r  r   _load_chat_template)r  r  fr   r  r  r  msgrJ   r  rK   r  Z  sJ   

(

2
r  c                C   r.  )Nr  )_cached_load_chat_template)r  r  rJ   rJ   rK   load_chat_template  s   r  placeholder_storagetextsc                 C   s6   t |D ]\}}|| v r| | d||< qd|S )Nr   
)r  popjoin)r  r  idxelemrJ   rJ   rK   _get_interleaved_text_prompt  s
   
r  interleave_stringsc                 C   s   t dd |  D }|rt| |}nd|}g }|D ]1}||  ||8  < || dk rCtd| td| td| d|	|g||   q|rXd||g S d|S )	z;Combine multimodal prompts for a multimodal language model.c                 S   s   g | ]	}|D ]}|qqS rJ   rJ   )r   r  r]  rJ   rJ   rK   r     s    z4_get_full_multimodal_text_prompt.<locals>.<listcomp>r  r   zPlaceholder count is negative! Ensure that the 'interleave_strings' flag is disabled (current value: %s) when manually placing image placeholders.zInput prompt: %szFound more 'zA' placeholders in input prompt than actual multimodal data items.)
r   valuesr  r  countr   errordebugr   extend)r  r  r  placeholder_countstext_promptmissing_placeholdersr:  rJ   rJ   rK    _get_full_multimodal_text_prompt  s,   


r  _ContentPartc                 C      t | dd S Ntext_TextParserr%  partrJ   rJ   rK   <lambda>      r  c                 C   r  )Nrz   )_ThinkParserr%  r  rJ   rJ   rK   r    r  c                 C   r  r  r  r  rJ   rJ   rK   r    r  c                 C   r  r  r  r  rJ   rJ   rK   r    r  c                 C   r  )Nru   )_ResponsesInputImageParserr%  r  rJ   rJ   rK   r    r  c                 C      t | di dd S )Nru   rV   )_ImageParserr%  r  rJ   rJ   rK   r    r   c                 C   r  )Nrc   )_ImageEmbedsParserr%  r  rJ   rJ   rK   r    r  c                 C   r  )Nri   )_AudioEmbedsParserr%  r  rJ   rJ   rK   r    r  c                 C   r  )Nrn   )_PILImageParserr%  r  rJ   rJ   rK   r    r  c                 C   r  )Nr^   rV   )_AudioParserr%  r  rJ   rJ   rK   r    r   c                 C   r  )NrD  )_InputAudioParserr%  r  rJ   rJ   rK   r    r  c                 C   r  )Nrefusal)_RefusalParserr%  r  rJ   rJ   rK   r    r  c                 C   r  )Nrl   rV   )_VideoParserr%  r  rJ   rJ   rK   r    r   )r  rz   
input_textoutput_textinput_imageru   rc   ri   rn   r^   rD  r  rl   MM_PARSER_MAPr  c                 C   s  t | tsJ | dd}| dd}t |tr;|tv r;|du r;t| | }|dkr7| dddkr7td ||fS |du sC|durd| v ratt| }|dd}t |tr]|dd}d|fS d	| v rttt	| }|d	d}d	|fS d
| v rtt
| }|d
d}d
|fS d| v rtt| }|dd}	d|	fS d| v rtt| }|dd}
t |
tr|
dd}
d|
fS | ddurttttf | }d|fS d| v rtt| }|dd}t |tr|dd}d|fS tdt |tstd|dfS )a  
    Parses a given multi-modal content part based on its type.

    Args:
        part: A dict containing the content part, with a potential 'type' field.

    Returns:
        A tuple (part_type, content) where:
        - part_type: Type of the part (e.g., 'text', 'image_url').
        - content: Parsed content (e.g., text, image URL).

    Raises:
        ValueError: If the 'type' field is missing and no direct URL is found.
    r_   Nrd   ru   detailr   zB'image_url.detail' is currently not supported and will be ignored.rV   rn   rc   ri   r^   rD  rl   z(Missing 'type' field in multimodal part.z(Invalid 'type' field in multimodal part.zunknown part_type content)r   rf   r%  rX   r  r   r   r   rt   rq   rb   rh   rw   rx   r   )r  	part_typerd   r   image_paramsru   rn   rc   audio_paramsri   r^   input_audio_paramsvideo_paramsrl   rJ   rJ   rK   #_parse_chat_message_content_mm_part  st   





r  )r  r  r0  r~   parts
mm_tracker
wrap_dictsr   c                C   s   t t  }|j|d}|D ]}t||||d}	|	r||	 q|r(t| |dgS tt t |}
| }|r<t	||
|}nd
|
}t| |dgS )Nr0  )r  r  )r~   r   r  )r   r  r   _parse_chat_message_content_partr   r   r   rX   r>  r  r  )r~   r  r  r  r  r   r   	mm_parserr  	parse_resr  r>  r  rJ   rJ   rK   !_parse_chat_message_content_partsb  s,   
	

r  r  c                C   s  t | tr|rd| dS | S t| \}}|tv r%|du r%td| | dS |dv r7tt|}|r5d|dS |S | dd}|durEt|}d}|dkr`|durUttj|nd}	|	|	| d}n|d	v rrtt|}|
|| d}ny|d
kr|durtttttf B |nd}||| d}n[|dkr|durtttttf B |nd}||| d}n=|dkrtt|}||| d}n+|dkrtt|}
||
| d}n|dkrtt|}||| d}ntd| |rd|iS |rt| S dS )a|  Parses a single part of a conversation. If wrap_dicts is True,
    structured dictionary pieces for texts and images will be
    wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
    {"type": "image"}, respectively. Otherwise multimodal data will be
    handled by mm_parser, and texts will be returned as strings to be joined
    with multimodal placeholders.
    r  r_   r  NzKSkipping multimodal part '%s' (type: '%s') with empty / unparsable content.)r  r  r  r  rz   rd   rn   rQ   )ru   r  rc   ri   rR   r^   rD  rl   rS   zUnknown part type: r_   )r   rX   r  PART_TYPES_TO_SKIP_NONE_CONTENTr   r   r   r%  r#   rB  r?  rf   r@  rF  rC  r    rE  rG  r   r;  )r  r  r  r  r  r   str_contentrd   r   image_contentdict_contentrJ   rJ   rK   r    sj   




""


r  messagecontent_formatc                 C   s   | d }|  d}|  d}|d u rg }nt|tr!td|dg}t||||dk||d}|D ]^}	|dkr^t| }
d	|
v rK|
d	 d urKt|
d	 |	d	< |d ur]tt||	d< tt||	d
< n|dkrpt| }
d|
v rp|
d |	d< d| v rt| d tr| d |	d< |dkr|  dd |	d< q/|S )Nr~   r   r   r  r  r   )r  r  r   	assistantr   r   toolr   r?   	developerr   )	r%  r   rX   r   r  _AssistantParserr   r   _ToolParser)r  r  r  r  r   r~   r   r   result
result_msg
parsed_msgrJ   rJ   rK   _parse_chat_message_content  sH   


	r  messagesc                 C   s   | D ]I}|d dkrKd|v rK| d}t|tsqt|dkr&|dd  q|D ]"}|d  d }rDt|ttfsCt||d d< q(i |d d< q(qd S )Nr~   r  r   r   function	arguments)r%  r   r   r   r  rf   jsonloads)r  r  r   r   r   rJ   rJ   rK   _postprocess_messages  s    

r  rp   r   c                 C   sj   g }t ||d}| D ]}t||||dko|jd uo|jj|d}|| q
t| | \}	}
||	|
fS N)r   r   )r  r   )r+  r  r   interleave_mm_stringsr  r  r-  r  rp   r  r   r   conversationr  r  sub_messagesr&  r'  rJ   rJ   rK   parse_chat_messages,  s"   
r  c                    sr   g }t ||d}| D ]}t||||dko|jd uo|jj|d}|| qt| | I d H \}	}
||	|
fS r  )r3  r  r   r  r  r  r-  r  rJ   rJ   rK   parse_chat_messages_asyncP  s(   
r  r  c                 C   sD   d}| D ]}|d dkr| d}||d urtt|nd7 }q|S )Nr   r~   r  r   )r%  r   r   )r  r  r  r   rJ   rJ   rK   get_history_tool_calls_cntv  s   
r  randomid_typec                 C   s$   | dkrd| d| S dt   S )Nkimi_k2z
functions.:zchatcmpl-tool-r9   )r  	func_namer  rJ   rJ   rK   make_tool_call_id  s   r  r   r,  )r  NN)r4  r  rE   abcr   r   collectionsr   r   collections.abcr   r   r   dataclassesr	   	functoolsr
   r   r   	itertoolsr   pathlibr   typingr   r   r   r   r   r   r   openai.types.chatr   r   r   r   r   r   r   r   r   $OpenAIChatCompletionContentPartParamr    OpenAIChatCompletionMessageParam@openai.types.chat.chat_completion_content_part_input_audio_paramr    openai.types.responsesr!   openai_harmonyr"   OpenAIHarmonyMessagePILr#   pydanticr$   r%   r&   typing_extensionsr'   r(   vllmr)   vllm.configr*   vllm.loggerr+   vllm.model_executor.modelsr,   vllm.multimodalr-   r.   r/   vllm.multimodal.inputsr0   r1   r2   r3   r4   r5   vllm.multimodal.mediar6   r7   vllm.multimodal.processingr8   
vllm.utilsr:   vllm.utils.collection_utilsr;   vllm.utils.import_utilsr<   r>   r=   globalsrI   r   rX   rL   r   rM   r;  rU   r]   rb   rh   rj   rk   rm   rq   rt   rw   rx   ry   rY   r}   r   r  ChatTemplateContentFormatr  r   r   r   r   r   rf   r   r   r   r  objectr!  r)  r+  r3  r   r1  r8  r  r  r|   r  r  r  r  r  r  r  r  r  r  r  r  validate_pythonr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rJ   rJ   rJ   rK   <module>   sR  
$(
 
			




3
s
>

 1
=~ #"

0






7









 

`
&

R

4
(
&	