o
    :/i8&                     @   sP  d dl Z d dlmZ d dlmZmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d	d
lmZ d	dlmZmZmZmZmZ d	dlmZmZm Z m!Z! e	rcd dl"Z#nede$ dZ#dddej%de&de'de'fddZ(dddej%de&de'de'fddZ)ddddejde'de'de'fddZ*ddddejde'de'de'fdd Z+d!dd"ej,de'de'fd#d$Z-d!dd"ej,de'de'fd%d&Z.d'ede/e0e'e&f  fd(d)Z1d*efd+d,Z2dd-d.d/ee d0e#j3j4d1e5fd2d3Z6dd-d.d/ee d0e#j3j4d1e5dee0e&ef  fd4d5Z7dd-d.d6e/e0e'ef  d0e#j3j4d1e5dee0e'e&ef ddf fd7d8Z8ed9dd-d.d6e/e0e'ef  d0e#j3j4d1e5dee0e'e&ef ddf fd:d;Z9	dHd<e'd=e:e'e
f dB de0ej%e&e;B f fd>d?Z<	dHd@e'dAe:e'e
f dB dejfdBdCZ=	dHdDe'dEe:e'e
f dB de0ej,e:e'e
f f fdFdGZ>dS )I    N)defaultdict)	GeneratorSequence)groupby)TYPE_CHECKINGAny)Image)
deprecated)
LazyLoader   )MultiModalHasher)BatchedTensorInputsMultiModalFieldElemMultiModalKwargsItemMultiModalPlaceholderDictMultiModalSharedField)AudioMediaIOImageMediaIOMediaConnectorVideoMediaIOtorchWAVformataudiosampling_rater   returnc                C   s   t  }|j| |f|dS )zEncode audio as base64.)audio_format)r   encode_base64)r   r   r   audio_io r    b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/multimodal/utils.pyencode_audio_base64!   s   r"   c                C   4   t | ||d}tjd|  d}d| d| S )zEncode audio as a data URL.r   .r   data:;base64,)r"   	mimetypes	types_mapgetlower)r   r   r   	audio_b64mimetyper    r    r!   encode_audio_url,   s   r-   RGBPNG
image_moder   imager1   c                C   s   t |d}|j| |dS )z
    Encode a pillow image to base64 format.

    By default, the image is converted into RGB format before being encoded.
    )r1   )image_format)r   r   )r2   r1   r   image_ior    r    r!   encode_image_base648   s   
r5   c                C   r#   )z|
    Encode a pillow image as a data URL.

    By default, the image is converted into RGB format before being encoded.
    r0   r$   r2   r%   r&   )r5   r'   r(   r)   r*   )r2   r1   r   	image_b64r,   r    r    r!   encode_image_urlG   s   r7   JPEGframesc                C   s   t  }t|}|j| |dS )N)video_format)r   r   r   )r9   r   r4   video_ior    r    r!   encode_video_base64W   s   r<   c                C   sD   t | |d}| dkrd}ntjd|  d}d| d| S )Nr   jpegz
video/jpegr$   videor%   r&   )r<   r*   r'   r(   r)   )r9   r   	video_b64r,   r    r    r!   encode_video_urla   s
   r@   mm_positionsc                 C   s0   dd |   D }t|dd d}dd |D S )a/  
    Given a `MultiModalPlaceholderDict`, output a sequence of keys to
    sort the dictionary by `offset` (starting index in the input sequence)
    in ascending order.

    Returns:
        A list of `(modality, idx)`, which can be used to access an item
        by `mm_positions[modality][idx]`.
    c                 s   s0    | ]\}}t |D ]
\}}|||fV  q
qd S N)	enumerate).0modalityitemsidxitemr    r    r!   	<genexpr>|   s    z'argsort_mm_positions.<locals>.<genexpr>c                 S   s
   | d j S )N   )offsetxr    r    r!   <lambda>   s   
 z&argsort_mm_positions.<locals>.<lambda>keyc                 S   s   g | ]	\}}}||fqS r    r    )rD   rE   rG   _r    r    r!   
<listcomp>   s    z(argsort_mm_positions.<locals>.<listcomp>)rF   sorted)rA   
flat_itemssorted_flat_itemsr    r    r!   argsort_mm_positionsp   s
   rV   elemc                 C   s   t | jtsd S tj| jdS )N)data)
isinstancefieldr   r   hash_kwargsrX   )rW   r    r    r!   _get_group_hash   s   r\   Fdevice
pin_memoryrF   r^   r_   c                   sV   t ttt f t}| D ]}| D ]\}}|| | qq fdd| D S )Nc                    s(   i | ]\}}||d  j j| dqS )r   r]   )rZ   reduce_data)rD   rP   elemsr]   r    r!   
<dictcomp>   s    z#_batch_mm_items.<locals>.<dictcomp>)r   strlistr   rF   append)rF   r^   r_   ra   rH   rP   rW   r    r]   r!   _batch_mm_items   s   rf   c                c   sp    dd | D }dd t |D }d}|D ]}t| |||  ||d}||fV  ||7 }q|t| ks6J dS )a  
    Group consecutive items (possibly from different requests) into batches.

    Items must be split across groups if any of the following occurs,
    as the batch would otherwise be invalid:
    - They have different fields (e.g. mixed image and embedding inputs).
    - They have different values in `MultiModalSharedField`.

    Args:
        items: List of `MultiModalKwargsItem`.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(num_items, grouped_kwargs)`, where:
        - `kwargs` is a dictionary of keyword arguments to pass to the model;
        - `num_items` is the corresponding number of items.
    c                 S   s.   g | ]}t d d t| dd dD qS )c                 s   s     | ]\}}|t |fV  qd S rB   )r\   )rD   rP   rW   r    r    r!   rI      s
    

6group_and_batch_mm_items.<locals>.<listcomp>.<genexpr>c                 S      | d S Nr   r    )kvr    r    r!   rN          z5group_and_batch_mm_items.<locals>.<listcomp>.<lambda>rO   )tuplerS   rF   )rD   rH   r    r    r!   rR      s    z,group_and_batch_mm_items.<locals>.<listcomp>c                 S   s"   g | ]\}}t d d |D qS )c                 s   s    | ]}d V  qdS )r   Nr    )rD   rQ   r    r    r!   rI      s    rg   )sum)rD   rQ   groupr    r    r!   rR      s   " r   r]   N)r   rf   len)rF   r^   r_   	group_idsgroup_sizes	start_idx
group_size
group_datar    r    r!   group_and_batch_mm_items   s   

ru   	mm_kwargsc                c   sR    t | dd dD ]\}}dd |D }t|||dD ]
\}}|||fV  qq	dS )a  
    Group consecutive items (possibly from different requests) into batches.

    Items must be split across groups if any of the following occurs,
    as the batch would otherwise be invalid:
    - They have different fields (e.g. mixed image and embedding inputs).
    - They have different values in `MultiModalSharedField`.

    To simplify the implementation of `embed_multimodal`, we add another
    restriction that the items in a batch must belong to the same modality.

    Args:
        mm_kwargs: List of `(modality, item)`.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(modality, num_items, grouped_kwargs)`, where:
        - `modality` is the modality of the batch;
        - `kwargs` is a dictionary of keyword arguments to pass to the model;
        - `num_items` is the corresponding number of items.
    c                 S   rh   ri   r    rL   r    r    r!   rN      rk   z+group_and_batch_mm_kwargs.<locals>.<lambda>rO   c                 S   s   g | ]\}}|qS r    r    )rD   rQ   rH   r    r    r!   rR      s    z-group_and_batch_mm_kwargs.<locals>.<listcomp>r]   N)r   ru   )rv   r^   r_   rE   rn   	items_lst	num_itemsmm_kwargs_batchr    r    r!   group_and_batch_mm_kwargs   s   rz   zu`group_mm_kwargs_by_modality` has been renamed to `group_and_batch_mm_kwargs`. The old name will be removed in v0.19.c                C   s   t | ||dS )Nr]   )rz   )rv   r^   r_   r    r    r!   group_mm_kwargs_by_modality   s   
r{   	audio_urlaudio_io_kwargsc                 C   &   |sdnd|i}t |dd}|| S )a+  
    Args:
        audio_url: URL of the audio file to fetch.
        audio_io_kwargs: Additional kwargs passed to handle audio IO.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    Nr   /media_io_kwargsallowed_local_media_path)r   fetch_audio)r|   r}   r   media_connectorr    r    r!   r        
r   	image_urlimage_io_kwargsc                 C   r~   )a+  
    Args:
        image_url: URL of the image file to fetch.
        image_io_kwargs: Additional kwargs passed to handle image IO.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    Nr2   r   r   )r   fetch_image)r   r   r   r   r    r    r!   r     r   r   	video_urlvideo_io_kwargsc                 C   r~   )a+  
    Args:
        video_url: URL of the video file to fetch.
        video_io_kwargs: Additional kwargs passed to handle video IO.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    Nr>   r   r   )r   fetch_video)r   r   r   r   r    r    r!   r   1  r   r   rB   )?r'   collectionsr   collections.abcr   r   	itertoolsr   typingr   r   numpynpnumpy.typingnptPILr   typing_extensionsr	   vllm.utils.import_utilsr
   hasherr   inputsr   r   r   r   r   mediar   r   r   r   torch.typesr   globalsndarrayintrc   r"   r-   r5   r7   NDArrayr<   r@   rd   rl   rV   r\   typesDeviceboolrf   ru   rz   r{   dictfloatr   r   r   r    r    r    r!   <module>   s*  










3
'

