o
    :/i[                  	   @   s  U d dl mZmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d	d
lmZmZm Z  d	dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d	dl-m.Z. edZ/edZ0erd dl1m2Z3 nede4 dZ3G dd deee/e0f Z5G dd de5e	e/ e/f Z6	d:dej7de8de9dB ddfddZ:G dd de5ej7e;ej7 B ej7f Z<G dd de5ee8ej7f ee8ej7f f Z=G dd  d e6e#dB  Z>G d!d" d"e<Z?G d#d$ d$eZ@G d%d& d&e6e$dB  ZAG d'd( d(e<ZBG d)d* d*e6e%dB  ZCG d+d, d,e<ZDG d-d. d.e6e ZEed/e5eef d0ZFG d1d2 d2ee8e5eef f ZGee'e ge5eef dB f ZHeeId3< G d4d5 d5ZJeKe8e	e8dB  f ZLeeId6< 	 d7e+dB deLfd8d9ZMdS );    )ABCabstractmethod)UserDict)CallableIteratorMappingSequenceSet)TYPE_CHECKINGAnyGenericLiteral
NamedTuple	TypeAlias	TypeGuardTypeVarN)assert_never)
is_list_of)
LazyLoader   )AudioResampler	AudioSpecnormalize_audio)	AudioItemHfAudioItemHfImageItemHfVideoItem	ImageItemModalityDataMultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsMultiModalUUIDDict	VideoItem)MediaWithBytes_T_IPILImagez	PIL.Imagec                       s  e Zd ZdZdededdf fddZdefdd	Zdefd
dZ	dede
fddZer5dee
 fddZedefddZedede
fddZdee
 fddZdedefddZdee fddZedeeef fddZedeeef fddZ  ZS )ModalityDataItemszy
    Represents data items for a modality in
    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    datamodalityreturnNc                    s   t    || _|| _d S N)super__init__r)   r*   )selfr)   r*   	__class__ b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/multimodal/parse.pyr.   8   s   

zModalityDataItems.__init__c                 C   s"   t | j d| jdt|  dS )Nz
(modality=z, len=))type__name__r*   lenr/   r2   r2   r3   __repr__>   s   "zModalityDataItems.__repr__c                 C   s   |   S r,   	get_countr8   r2   r2   r3   __len__A   s   zModalityDataItems.__len__indexc                 C   
   |  |S r,   getr/   r=   r2   r2   r3   __getitem__D      
zModalityDataItems.__getitem__c                 C   s   d S r,   r2   r8   r2   r2   r3   __iter__I   s    zModalityDataItems.__iter__c                 C      t )zGet the number of data items.NotImplementedErrorr8   r2   r2   r3   r;   K      zModalityDataItems.get_countc                 C   rE   )zGet a data item by its index.rF   rA   r2   r2   r3   r@   P   rH   zModalityDataItems.getc                        fddt   D S )zGet all data items.c                       g | ]}  |qS r2   r?   .0idxr8   r2   r3   
<listcomp>W       z-ModalityDataItems.get_all.<locals>.<listcomp>ranger;   r8   r2   r8   r3   get_allU   s   zModalityDataItems.get_allc                 C   r>   r,   r?   rA   r2   r2   r3   get_item_for_hashY   rC   z#ModalityDataItems.get_item_for_hashc                    rI   )Nc                    rJ   r2   )rS   rK   r8   r2   r3   rN   ]   rO   z<ModalityDataItems.get_all_items_for_hash.<locals>.<listcomp>rP   r8   r2   r8   r3   get_all_items_for_hash\   s   z(ModalityDataItems.get_all_items_for_hashc                 C   rE   )z)Get the data to pass to the HF processor.rF   r8   r2   r2   r3   get_processor_data_   rH   z$ModalityDataItems.get_processor_datac                 C   rE   )z+Get the data to pass directly to the model.rF   r8   r2   r2   r3   get_passthrough_datad   rH   z&ModalityDataItems.get_passthrough_data)r6   
__module____qualname____doc__r%   strr.   r9   intr<   r&   rB   r
   r   rD   r   r;   r@   listrR   objectrS   rT   r   rU   rV   __classcell__r2   r2   r0   r3   r(   2   s&     r(   c                   @   s   e Zd ZdZdeee B defddZdefddZdedefd	d
Z	dedeee B fddZ
deeef fddZdeeef fddZdS )ProcessorBatchItemsz6Base class for data items that are arranged in a list.itemr+   c                 C      t |tr|jS |S z&Extract media from wrapper if present.
isinstancer$   mediar/   r`   r2   r2   r3   _unwrapm   s   zProcessorBatchItems._unwrapc                 C   
   t | jS r,   r7   r)   r8   r2   r2   r3   r;   q   rC   zProcessorBatchItems.get_countr=   c                 C      |  | j| S r,   rg   r)   rA   r2   r2   r3   r@   t      zProcessorBatchItems.getc                 C   s
   | j | S r,   r)   rA   r2   r2   r3   rS   w   s   
z%ProcessorBatchItems.get_item_for_hashc                 C   s   | j  d|  iS )Ns)r*   rR   r8   r2   r2   r3   rU   {      z&ProcessorBatchItems.get_processor_datac                 C      i S r,   r2   r8   r2   r2   r3   rV   ~      z(ProcessorBatchItems.get_passthrough_dataN)r6   rW   rX   rY   r%   r$   rg   r[   r;   r@   rS   r   rZ   r]   rU   rV   r2   r2   r2   r3   r_   j   s    r_   tensorr*   r=   r+   c              	   C   sZ   | j dk s
| j dkr+|durd| dnd}t|  d| d| j  d	t| j dS )
a  Validate tensor ndim for multimodal embeddings.

    Single embeddings should be 2D (seq_len, hidden_size).
    Batched embeddings should be 3D (batch, seq_len, hidden_size).

    Args:
        tensor: The tensor to validate.
        modality: The modality name for error messages (e.g., "image", "audio").
        index: Optional index for list items, included in error messages.
          Nz [] z
 embeddingzL must be 2D (seq_len, hidden_size) or 3D (batch, seq_len, hidden_size), got D tensor with shape )ndim
ValueError
capitalizetupleshape)rr   r*   r=   idx_strr2   r2   r3   validate_embedding_ndim   s   r~   c                	       s   e Zd ZdZ	ddejeej B dededB ddf fddZ	dd	d
Z
deddfddZdejeej B dejfddZdefddZdedejfddZdeeef fddZdeeef fddZdedefddZ  ZS )EmbeddingItemsz
    Base class for data items that are expressed as a batched embedding tensor,
    or a list of embedding tensors (one per item).
    Nr)   r*   expected_hidden_sizer+   c                    s0   t  || |   |d ur| | d S d S r,   )r-   r.   _validate_ndim_validate_hidden_size)r/   r)   r*   r   r0   r2   r3   r.      s
   zEmbeddingItems.__init__c              
   C   sn   t | jtjrt| j| j dS t| jD ]\}}|jdkr4t| j	  d| d|j dt
|j qdS )z=Validate that embedding tensors have correct ndim (2D or 3D).rs    embedding [z)] must be 2D (seq_len, hidden_size), got rw   N)rd   r)   torchTensorr~   r*   	enumeraterx   ry   rz   r{   r|   )r/   rM   rr   r2   r2   r3   r      s   
zEmbeddingItems._validate_ndimc                 C   s   t | jtjr)| jjd }||kr't| j  d| d| dt| jj dS t	| jD ]%\}}|jd }||krSt| j  d| d| d| dt|j 	q.dS )a8  Validate that embedding hidden dimension matches expected size.

        This validates hidden dimensions to prevent vulnerabilities: Embeddings
        with correct ndim but wrong hidden dimension could bypass initial
        checks and cause crashes during model inference when dimensions don't match.
        z* embedding hidden dimension mismatch: got z, but model expects z. Embedding shape: r   z!] hidden dimension mismatch: got N)
rd   r)   r   r   r|   ry   r*   rz   r{   r   )r/   r   actual_hidden_sizerM   rr   r2   r2   r3   r      s6   

z$EmbeddingItems._validate_hidden_sizer`   c                 C   ra   rb   rc   rf   r2   r2   r3   rg      s   zEmbeddingItems._unwrapc                 C   rh   r,   ri   r8   r2   r2   r3   r;      rC   zEmbeddingItems.get_countr=   c                 C   rj   r,   rk   rA   r2   r2   r3   r@      rl   zEmbeddingItems.getc                 C   rp   r,   r2   r8   r2   r2   r3   rU      rq   z!EmbeddingItems.get_processor_datac                 C   s   | j  d| jiS )N_embeds)r*   r)   r8   r2   r2   r3   rV         z#EmbeddingItems.get_passthrough_dataitem_idxc                 C   s   t | |S r,   )r7   r@   )r/   r   r2   r2   r3   get_feature_size   s   zEmbeddingItems.get_feature_sizer,   )r+   N)r6   rW   rX   rY   r   r   r\   rZ   r[   r.   r   r   r$   rg   r;   r@   r   r]   rU   rV   r   r^   r2   r2   r0   r3   r      s0    	

r   c                       s   e Zd ZdZdeeejf dedee de	eeejf geee
f f ddf
 fdd	Zdefd
dZdedeeejf fddZdeeef fddZdeeef fddZ  ZS )DictEmbeddingItemsz
    Base class for data items that are expressed as a dictionary of tensors.

    Usually, the dictionary keys correspond to the outputs of HF processor.
    r)   r*   required_fieldsfields_factoryr+   Nc                    s   ddl m} t || ||  }|r't| }d| d| }t|||}	||	  }
|
rEt|	 }d|d|}t||	| _|| _t	
|t||	| _d S )Nr   )BatchFeaturez$The data should contain the fields: z%, but only found the following keys: zrequired_fields=z should be a subset of fields=)%transformers.feature_extraction_utilsr   r-   r.   keyssetry   fields_configr   r!   from_hf_inputsdict_kwargs)r/   r)   r*   r   r   r   missing_required_data_keys	data_keysmsgr   missing_required_fieldsfieldsr0   r2   r3   r.      s,   


zDictEmbeddingItems.__init__c                 C   s   t | j| j S r,   )r7   r   r*   r8   r2   r2   r3   r;      rl   zDictEmbeddingItems.get_countr=   c                 C   s   | j | j |  S r,   )r   r*   get_datarA   r2   r2   r3   r@   #  ro   zDictEmbeddingItems.getc                 C   rp   r,   r2   r8   r2   r2   r3   rU   &  rq   z%DictEmbeddingItems.get_processor_datac                 C   s   | j S r,   rm   r8   r2   r2   r3   rV   )  s   z'DictEmbeddingItems.get_passthrough_data)r6   rW   rX   rY   r   rZ   r   r   r   r   r    r.   r[   r;   r@   r]   rU   rV   r^   r2   r2   r0   r3   r      s*    
	&r   c                       s@   e Zd ZdeedB  ddf fddZdedefddZ  ZS )	AudioProcessorItemsr)   Nr+   c                       t  |d d S Naudior-   r.   r/   r)   r0   r2   r3   r.   .  r   zAudioProcessorItems.__init__r   c                 C   (   |  |}|d u rtd| t|S )Nz%Cannot get length of cached audio at r@   ry   r7   )r/   r   r   r2   r2   r3   get_audio_length1     
z$AudioProcessorItems.get_audio_length)	r6   rW   rX   r   r   r.   r[   r   r^   r2   r2   r0   r3   r   -      r   c                       >   e Zd Z	ddejeej B dedB ddf fddZ  ZS )AudioEmbeddingItemsNr)   r   r+   c                       t  |d| d S r   r   r/   r)   r   r0   r2   r3   r.   :     zAudioEmbeddingItems.__init__r,   	r6   rW   rX   r   r   r\   r[   r.   r^   r2   r2   r0   r3   r   9      r   c                   @   s   e Zd ZU eed< eed< dS )	ImageSizewidthheightN)r6   rW   rX   r[   __annotations__r2   r2   r2   r3   r   B  s   
 r   c                       s@   e Zd ZdeedB  ddf fddZdedefddZ  Z	S )	ImageProcessorItemsr)   Nr+   c                    r   Nimager   r   r0   r2   r3   r.   H  r   zImageProcessorItems.__init__r   c                 C   sj   |  |}|d u rtd| t|tjrt|j S t|tjt	j
fr/|j\}}}t||S t| d S )Nz#Cannot get size of cached image at )r@   ry   rd   r'   Imager   sizenpndarrayr   r   r|   r   )r/   r   r   _hwr2   r2   r3   get_image_sizeK  s   


z"ImageProcessorItems.get_image_size)
r6   rW   rX   r   r   r.   r[   r   r   r^   r2   r2   r0   r3   r   G  r   r   c                       r   )ImageEmbeddingItemsNr)   r   r+   c                    r   r   r   r   r0   r2   r3   r.   Z  r   zImageEmbeddingItems.__init__r,   r   r2   r2   r0   r3   r   Y  r   r   c                	       sz   e Zd Z	ddeedB  deeef eeeef dB  B dB ddf fddZ	de
de
fdd	Zde
defd
dZ  ZS )VideoProcessorItemsNr)   metadatar+   c                    s   t  |d || _d S Nvideo)r-   r.   r   )r/   r)   r   r0   r2   r3   r.   c  s   
zVideoProcessorItems.__init__r   c                 C   r   )Nz%Cannot get length of cached video at r   )r/   r   r   r2   r2   r3   get_num_framesl  r   z"VideoProcessorItems.get_num_framesc                 C   s   |  |}|d u rtd| t|dkrtd| |d }t|tjr,t|j S t|tj	t
jfr@|j\}}}t||S t| d S )Nz#Cannot get size of cached video at r   z"Cannot get size of empty video at )r@   ry   r7   rd   r'   r   r   r   r   r   r   r   r|   r   )r/   r   r   r   r   r   r   r2   r2   r3   get_frame_sizes  s   


z"VideoProcessorItems.get_frame_sizer,   )r6   rW   rX   r   r   r   rZ   r   r\   r.   r[   r   r   r   r^   r2   r2   r0   r3   r   b  s    
"	r   c                       r   )VideoEmbeddingItemsNr)   r   r+   c                    r   r   r   r   r0   r2   r3   r.     r   zVideoEmbeddingItems.__init__r,   r   r2   r2   r0   r3   r     r   r   c                       s.   e Zd ZdZdee ddf fddZ  ZS )VisionChunkProcessorItemszCProcessor items for vision chunks (unified image and video chunks).r)   r+   Nc                    r   )Nvision_chunkr   r   r0   r2   r3   r.     r   z"VisionChunkProcessorItems.__init__)r6   rW   rX   rY   r   r   r.   r^   r2   r2   r0   r3   r     s    "r   _D)boundc                   @   s~   e Zd ZdZdee fddZdddeded	efd
dZ	d	e
eef fddZdedee eee df B d	efddZdS )MultiModalDataItemsz
    As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but
    normalized such that each entry corresponds to a list.
    
modalitiesc                    s   t  fdd|D S )zq
        Construct a new `MultiModalDataItems` instance containing only the
        selected modalities.
        c                    s   i | ]}| | qS r2   r2   )rL   r*   r8   r2   r3   
<dictcomp>  rO   z.MultiModalDataItems.select.<locals>.<dictcomp>)r   )r/   r   r2   r8   r3   select  s   zMultiModalDataItems.selectT)strictr*   r   r+   c                C   s<   || vr|rt |  }td|d| dS | |  S )z
        Get the number of data items belonging to a modality.

        If `strict=False`, return `0` instead of raising [`KeyError`][]
        even if the modality is not found.
        	Modality " not found. Available modalities: r   )r   r   KeyErrorr;   )r/   r*   r   available_modalitiesr2   r2   r3   r;     s   zMultiModalDataItems.get_countc                 C   s   dd |   D S )z3Get the number of items belonging to each modality.c                 S   s   i | ]	\}}||  qS r2   r:   )rL   mitemsr2   r2   r3   r     s    z6MultiModalDataItems.get_all_counts.<locals>.<dictcomp>r   r8   r2   r2   r3   get_all_counts  s   z"MultiModalDataItems.get_all_countstyp.c                 C   s\   || vrt |  }td|d| | | }t||s,td|d| dt| |S )zs
        Get the data items belonging to a modality,
        requiring that they belong to a certain type.
        r   r   z(Invalid type of data items for modality=z. Expected type: z, but found type: )r   r   r   rd   	TypeErrorr5   )r/   r*   r   r   r   r2   r2   r3   	get_items  s"   	
zMultiModalDataItems.get_itemsN)r6   rW   rX   rY   r	   rZ   r   boolr[   r;   r   r   r5   r   r{   r   r2   r2   r2   r3   r     s    	r   ModalityDataParserc                       sv  e Zd ZdZdddddddedB dedB ded	 d
ededB ddf fddZe	de
deejeej B  fddZdedeejedB f fddZdedeejeeef dB f fddZdee deeef dB fddZdee deeef dB fddZdee deeef dB fddZdee deeef dB fddZ de!ee"f fd d!Z#d"e$de%fd#d$Z&  Z'S )%MultiModalDataParsera  
    Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
    into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].

    Args:
        target_sr (float, optional): Enables automatic resampling of audio
            items to the model's expected sampling rate.
        target_channels (int, optional): Target number of audio channels.
            If provided, normalizes audio to this many channels (e.g., 1 for mono).
            If None, audio channels are passed through unchanged.
        expected_hidden_size (int, optional): Expected hidden dimension for
            embedding inputs. If provided, validates that user-supplied
            embeddings have the correct hidden size to prevent crashes
            during model inference.
    NlibrosaF)	target_srtarget_channelsaudio_resample_methodvideo_needs_metadatar   r   r   r   )r   scipyr   r   r+   c                   s.   t    t||d| _|| _|| _|| _d S )N)r   method)r-   r.   r   audio_resamplerr   r   r   )r/   r   r   r   r   r   r0   r2   r3   r.     s   
	
zMultiModalDataParser.__init__r)   c                 C   s@   t |tjr|jdkS t|tjrt|dkr|d jdkS dS )Nrt   r   rs   F)rd   r   r   rx   r   r7   )clsr)   r2   r2   r3   is_embeddings  s
   
z"MultiModalDataParser.is_embeddingsr   c                 C   ^   t |tr|S t |trt|d fS t |tjr|d fS t |tjr)| d fS t	| d S r,   
rd   r{   r\   r   arrayr   r   r   numpyr   )r/   r   r2   r2   r3   _get_audio_with_sr     

z'MultiModalDataParser._get_audio_with_srr   c                 C   r   r,   r   )r/   r   r2   r2   r3   _get_video_with_metadata  r   z-MultiModalDataParser._get_video_with_metadatac           	      C   s   |d u rd S |  |rt|| jS t|trt|dks/t|tjt	j
fr*|jdks/t|tr3|g}nt|tjt	j
frDdd |D }n|}ttj  }|D ]-}| |\}}|d u r^|}n| jj||d}| jd urvt| jd}t||}|| qNt|S )Nr   r   c                 S      g | ]}|qS r2   r2   rL   elemr2   r2   r3   rN   =      z:MultiModalDataParser._parse_audio_data.<locals>.<listcomp>)orig_sr)r   )r   r   r   r   floatr7   rd   r   r   r   r   rx   r{   r\   r   r   resampler   r   r   appendr   )	r/   r)   
data_items
new_audios	data_itemr   r   	new_audiospecr2   r2   r3   _parse_audio_data+  s6   



z&MultiModalDataParser._parse_audio_datac                 C   s   |d u rd S |  |rt|| jS t|tjtfs't|tjt	j
fr.|jdkr.|g}t|S t|tjt	j
frBdd |D }t|S |}t|S )Nrt   c                 S   r   r2   r2   r   r2   r2   r3   rN   a  r   z:MultiModalDataParser._parse_image_data.<locals>.<listcomp>)r   r   r   rd   r'   r   r$   r   r   r   r   rx   r   )r/   r)   r   r2   r2   r3   _parse_image_dataR  s   

z&MultiModalDataParser._parse_image_datac                 C   s.  |d u rd S |  |rt|| jS t|tjrt|dks+t|tj	t
jfr/|jdkr/|g}n"t|tj	t
jfr@dd |D }nt|trOt|dkrO|g}n|}tttj	tttf d B f   }g }|D ]&}| |\}}| jr|d u rytd|||f || qe|| qe| jsd }t||dS )Nr      c                 S   r   r2   r2   r   r2   r2   r3   rN   w  r   z:MultiModalDataParser._parse_video_data.<locals>.<listcomp>rs   ziVideo metadata is required but not found in mm input. Please check your video input in `multi_modal_data`)r   )r   r   r   r   r'   r   r7   rd   r   r   r   r   rx   r{   r\   r   rZ   r   r   r   ry   r   r   )r/   r)   r   
new_videosmetadata_lstr   r   r   r2   r2   r3   _parse_video_datag  s:   

 z&MultiModalDataParser._parse_video_datac                 C   s6   |du rdS |  |rtdt|tr|g}t|S )z9Parse vision chunk data (unified image and video chunks).Nz8Do not support embedding data for vision_chunk right now)r   ry   rd   r   r   r   r2   r2   r3   _parse_vision_chunk_data  s   

z-MultiModalDataParser._parse_vision_chunk_datac                 C   s   | j | j| j| jdS )N)r   r   r   r   )r   r   r  r  r8   r2   r2   r3   _get_subparsers  s
   z$MultiModalDataParser._get_subparsersmm_datac                 C   sV   |   }t }| D ]\}}||vrtd| || | }d ur(|||< q|S )NzUnsupported modality: )r  r   r   ry   )r/   r  
subparsersmm_itemskvparsed_datar2   r2   r3   parse_mm_data  s   z"MultiModalDataParser.parse_mm_data)(r6   rW   rX   rY   r   r[   r   r   r.   classmethodr]   r   r   r   r\   r   r   r{   r   r   r   r#   r   rZ   r   r   r   r(   r   r   r   r  r  r   r   r  r   r   r
  r^   r2   r2   r0   r3   r     st    



'

*
r   MultiModalUUIDItemsmm_uuidsc                 C   s   | d u ri S dd |   D S )Nc                 S   s&   i | ]\}}|t |tr|gn|qS r2   )rd   rZ   )rL   r*   uuidsr2   r2   r3   r     s    z"parse_mm_uuids.<locals>.<dictcomp>r   )r  r2   r2   r3   parse_mm_uuids  s
   r  r,   )Nabcr   r   collectionsr   collections.abcr   r   r   r   r	   typingr
   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   vllm.utils.collection_utilsr   vllm.utils.import_utilsr   r   r   r   r   inputsr   r   r   r   r   r   r   r    r!   r"   r#   re   r$   r%   r&   	PIL.Imager   r'   globalsr(   r_   r   rZ   r[   r~   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r2   r2   r2   r3   <module>   sj   (48


W<		#	 A [