o
    :/i$                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	Z
d dlmZ d dlmZ d dlmZ d dlmZ zd dlZd dlmZ W n eyV   edZedd	ZY nw eeZd
ejdeeef dejfddZd
ejdedejfddZ d
ejdedejfddZ!G dd deZ"G dd deZ#G dd dZ$e Z%G dd dZ&e%'dG dd de$e&Z(e%'d G d!d" d"e$e&Z)e%'d#G d$d% d%e$e&Z*e%'d&G d'd( d(e(Z+e%'d)G d*d+ d+e$e&Z,dS ),    N)abstractmethod)BytesIO)Any
NamedTuplecast)init_logger)PlaceholderModule)ExtensionManagercv2videoio_registryframessizereturnc                 C   s\   | j \}}}}|\}}tj||||f| jd}t| D ]\}}	t|	||f}
|
||< q|S )Ndtype)shapenpemptyr   	enumerater
   resize)r   r   
num_frames_channels
new_height	new_widthresized_framesiframeresized_frame r   b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/multimodal/video.pyresize_video   s   
r!   size_factorc                 C   s4   | j \}}}}t|| }t|| }t| ||fS N)r   intr!   )r   r"   r   heightwidthr   r   r   r   r    rescale_video_size'   s   r'   r   c                 C   s<   | j d }|dkr| S tjd|d |td}| |df }|S )Nr      r   .)r   r   linspacer$   )r   r   total_framesframe_indicessampled_framesr   r   r    sample_frames_from_video/   s   
r.   c                   @   *   e Zd ZU dZeed< eed< eed< dS )VideoTargetMetadataz!Metadata represents target video.r   fpsmax_durationN__name__
__module____qualname____doc__r$   __annotations__floatr   r   r   r    r0   9   
   
 r0   c                   @   r/   )VideoSourceMetadataz!Metadata represents source video.total_frames_numoriginal_fpsdurationNr3   r   r   r   r    r;   A   r:   r;   c                
   @   st   e Zd Zedededee fddZee	de
deejeeef f fddZeded	ee d
efddZdS )VideoLoadersourcetargetr   c                 K      t )z:Return the list of frame indices to sample from the video.NotImplementedError)clsr@   rA   kwargsr   r   r    compute_frames_index_to_sampleJ      z*VideoLoader.compute_frames_index_to_sampledatac                 K   rB   )zFLoad video frames from bytes and return (frames_array, metadata_dict).rC   )rE   rI   rF   r   r   r    
load_bytesT   rH   zVideoLoader.load_bytesvalid_frame_indicesvideo_backendc                 C   s"   |j |j|j||t||j kdS )N)total_num_framesr1   r>   rL   frames_indicesdo_sample_frames)r<   r=   r>   len)rE   r@   rK   rL   r   r   r    create_hf_metadata^   s   zVideoLoader.create_hf_metadataN)r4   r5   r6   classmethodr;   r0   listr$   rG   r   bytestuplenptNDArraydictstrr   rJ   rQ   r   r   r   r    r?   I   s0    	r?   c                   @   s  e Zd Zedd ZededdfddZedddefd	d
Z	ede
dee
 dee
e
f de
def
ddZedddee
 de
deejee
 ee
e
f f fddZedee
 de
deejee
 f fddZedddddee
 de
dedeejee
 f f
ddZdS )OpenCVVideoBackendMixinc                  C   s^   d } t  D ]&}t |sqt |s(t |\}}}|dk s'|dkr(|dk r(q|}  | S | S )Nr)      )vrgetStreamBufferedBackends
hasBackendisBackendBuiltIn%getStreamBufferedBackendPluginVersion)api_prefbackendr   abiapir   r   r    get_cv2_video_apis   s   

z)OpenCVVideoBackendMixin.get_cv2_video_apirI   r   zcv2.VideoCapturec                 C   s.   |   }tt||g }| std|S )NzCould not open video stream)re   r
   VideoCapturer   isOpened
ValueError)rE   rI   rb   capr   r   r    open_video_capture   s
   z*OpenCVVideoBackendMixin.open_video_captureri   c                 C   s>   t | tj}| tj}|dkr|| nd}t|||dS )Nr   r<   r=   r>   )r$   getr
   CAP_PROP_FRAME_COUNTCAP_PROP_FPSr;   )ri   r<   r=   r>   r   r   r    get_video_metadata   s   z*OpenCVVideoBackendMixin.get_video_metadataidxfailed_framesnext_target_mapr+   c                 C   s$   |sdS |d }| ||}||k S )z;Check if current frame can recover the oldest failed frame.Fr   )rl   )rE   rp   rq   rr   r+   oldest_failedlimitr   r   r    _can_use_for_recovery   s
   	z-OpenCVVideoBackendMixin._can_use_for_recoveryr,   c                 C   s  t |tj}t |tj}|dkr|dks"J d| d| t|}|r,|d nd}i }tt|d D ]}	||	d  |||	 < q8|||d < g }
g }g }i }d}t|d D ]o}||v }| }|su|rtt	
d| || q[| ||||}|s|r| \}}|r|dur|jdkrt|tj}|
| || |d7 }|r|d}|||< t	d||||  q[|rt	
d	| || q[|D ]}t	
d
| q|
rt|
}ntjd||dftjd}|||fS )a  
        Read frames with dynamic window forward-scan recovery.

        When a target frame fails to load, the next successfully grabbed
        frame (before the next target frame) will be used to recover it.

        Args:
            cap: OpenCV VideoCapture object
            frame_indices: Sorted list of target frame indices to load
            total_frames: Total number of frames in the video

        Returns:
            Tuple of (frames_array, valid_frame_indices, recovered_map)
            - frames_array: Array of loaded frames
            - valid_frame_indices: List of frame indices that were loaded
            - recovered_map: Dict mapping recovered_idx -> source_idx
        r   z Invalid video frame size: width=z	, height=r(   r)   z-Failed to grab frame %d during video loading.Nz-Recovered frame %d using frame %d (delay: %d)z1Failed to retrieve frame %d during video loading.z/Frame %d could not be recovered (end of video).   r   )r$   rl   r
   CAP_PROP_FRAME_WIDTHCAP_PROP_FRAME_HEIGHTsetrangerP   grabloggerwarningappendru   retriever   cvtColorCOLOR_BGR2RGBpopinfor   stackr   uint8)rE   ri   r,   r+   r&   r%   frame_idx_setmax_frame_idxrr   kframes_listrK   failed_frames_idxrecovered_mapr   rp   is_target_frameokcan_recoverretr   	rgb_framerecovered_idx
failed_idxr   r   r   r    _read_frames_with_recovery   s~   





z2OpenCVVideoBackendMixin._read_frames_with_recoveryr   c                 C   s   t |}t|tj}t|tj}tj|||dftjd}d}g }	t	|d D ]8}
|
 }|s=|
|v r<td|
 q*|
|v rb| \}}|r\t|tj||< |	|
 |d7 }q*td|
 q*t |	}||k rutd|| || |d | |	fS )Nrv   r   r   r)   zIFailed to grab frame %d during video loading. This frame will be skipped.zMFailed to retrieve frame %d during video loading. This frame will be skipped.zgVideo loading completed with %d broken/unreadable frames. Expected %d frames but only loaded %d frames.)rP   r$   rl   r
   rw   rx   r   r   r   rz   r{   r|   r}   r   r   r   r~   )rE   ri   r,   r   num_expected_framesr&   r%   r   r   rK   rp   r   r   r   valid_num_framesr   r   r    _read_frames_no_recovery  sF   

z0OpenCVVideoBackendMixin._read_frames_no_recoveryF)frame_recovery	frame_idxr<   r   c                C   s   |rt |}| |||\}}}|rtdt | nt|}	t |	}| ||	t|\}}t |}
|
|k r@td||
 ||
 ||fS )Nz7Frame recovery: %d frames recovered using forward scan.zqVideo loading completed with %d broken/unreadable frames. Expected to sample %d frames but only loaded %d frames.)rP   r   r|   r   ry   r   maxr}   )rE   ri   r   r<   r   num_frames_to_sampler   rK   r   r   r   r   r   r    read_frames@  s2   	

z#OpenCVVideoBackendMixin.read_framesN)r4   r5   r6   staticmethodre   rR   rT   rj   r;   ro   r$   rS   rX   boolru   rU   rV   rW   r   ry   r   r   r   r   r   r    rZ   r   sd    


i2rZ   opencvc                   @   sl   e Zd Zedededee fddZe				dd	e	d
ededede
deejeeef f fddZdS )OpenCVVideoBackendr@   rA   r   c                 K   s   |j }|j}|j}|j}|}|dkrt||}|dkr%t|t|| }td|}||kr7tt	d|}	|	S t
jd|d |td}
|
 }	|	S )Nr   r)   r   )r<   r>   r   r1   minmathfloorr   rS   rz   r   r*   r$   tolist)rE   r@   rA   rF   r<   r>   r   r1   r   r   uniform_sampled_framesr   r   r    rG   h  s$   

z1OpenCVVideoBackend.compute_frames_index_to_sampler(   ,  FrI   r   r1   r2   r   c                 K   `   |  |}t|}t|||d}	| j||	d}
| j||
|j|d\}}| j|d|d}||fS )a  
        Load video frames from bytes.

        Args:
            data: Raw video bytes
            num_frames: Target number of frames to sample (-1 for all)
            fps: Target FPS for sampling (-1 for original)
            max_duration: Maximum duration (unused in base backend)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        r   r1   r2   r@   rA   r<   r   r   r@   rL   rK   rj   rZ   ro   r0   rG   r   r<   rQ   )rE   rI   r   r1   r2   r   rF   ri   r@   rA   r   r   rK   metadatar   r   r    rJ     s.   


zOpenCVVideoBackend.load_bytesNr(   r(   r   Fr4   r5   r6   rR   r;   r0   rS   r$   rG   rT   r   rU   rV   rW   rX   rY   r   rJ   r   r   r   r    r   f  s6    r   opencv_dynamicc                   @   l   e Zd Zedededee fddZe					dd
e	dededede
deejeeef f fddZdS )OpenCVDynamicVideoBackendr@   rA   r   c                    s   |j }|j}|j|j}|j |j d ||kr2tt|  }t fddt	|D }|S t|  }	|	|krDt
t	|}|S tjd||	dd}
tfdd|
D }|S )Nr)   c              	      s(   h | ]}t tt|   qS r   r   r$   r   ceil).0r   r1   r   r=   r   r    	<setcomp>  s    zKOpenCVDynamicVideoBackend.compute_frames_index_to_sample.<locals>.<setcomp>r   T)endpointc              	      s$   h | ]}t  tt| qS r   r   r   t)r   r=   r   r    r     s    )r<   r>   r=   r2   r1   r$   r   r   sortedrz   rS   r   r*   )rE   r@   rA   rF   r<   r>   r2   nframe_indices_listnum_samplestarget_secondsr   r   r    rG     s2   
	z8OpenCVDynamicVideoBackend.compute_frames_index_to_sampler(   r[   r   FrI   r   r1   r2   r   c                 K   s   |  |}t|}|jd }	|jpt|	|j d }
t|j|j|
d}t|||d}| j	||d}| j
|||j|d\}}| j|d|d}||fS )  
        Load video frames with dynamic sampling based on duration.

        Args:
            data: Raw video bytes
            num_frames: Not used in dynamic backend
            fps: Target FPS for sampling (default: 2)
            max_duration: Maximum video duration to process (default: 300s)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        r)   rk   r   r   r   r   r   )rj   rZ   ro   r<   r>   roundr=   r;   r0   rG   r   rQ   )rE   rI   r   r1   r2   r   rF   ri   orig_sourcer   r>   r@   rA   r   r   rK   r   r   r   r    rJ     s>   



z$OpenCVDynamicVideoBackend.load_bytesNr(   r[   r   Fr   r   r   r   r    r     s6    &r   molmo2c                   @   s^  e Zd Ze	d#dedededee fddZededed	ed
edee dedB fddZ	ededB d	ededede
edB ejf f
ddZe	d$deded
ededB dee dB dejfddZededefddZe					d%ded
edB dedededede
ejeeef f fdd Ze	d&dedede
ejeeef f fd!d"ZdS )'Molmo2VideoBackend       @	video_fpssampling_fpsmax_fpsr   c                 C   s   t |}t |}t |}|du rtd|dks|dkr'td| d| d|| dkr8td| d| d	g }t||d
 |D ]}||krK |S || dkrX|t| qB|S )a  
        Return the subset of `video_fps` factors that remain multiples
        of `sampling_fps`.

        Examples:
            >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
            [2, 6]
            >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
            [1, 5]
            >>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
            [2]
            >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
            Traceback (most recent call last):
                ...
            ValueError: sampling_fps=2 must divide video_fps=5 to produce
                consistent frame steps.
        Nzsampling_fps must be providedr   z1video_fps and sampling_fps must be positive (got z, )zsampling_fps=z must divide video_fps=.r)   )r$   rh   rz   r~   r9   )rE   r   r   r   
candidates	candidater   r   r    get_candidate_target_fps)  s4   z+Molmo2VideoBackend.get_candidate_target_fps
max_framesr+   frame_sample_modecandidate_target_fpsNc                 C   s   d}d}|D ]8}t t|| d}	t||	 }
|dkr+d|v r&|
|kr& |S |}|
}q||
ks1J |
|kr6q|
|kr>|}|
}q|S )z]
        Get the target fps that best spans the videoand has the most frames sampled
        r   Nr)   uniform)r   r$   )rE   r   r   r+   r   r   num_frames_sampledselected_target_fps
target_fps	step_sizenum_frames_sampled_at_fpsr   r   r    get_target_fpsZ  s(   z!Molmo2VideoBackend.get_target_fpsr   c                 C   s^   |d u rt jd||dtd}ntt|| d}t d||}t||kr+|d | }||fS )Nr   F)r   r   r)   )r   r*   r$   r   arangerP   )rE   r   r+   r   r   r,   r   r   r   r    get_frame_times_and_chosen_fps  s   
z1Molmo2VideoBackend.get_frame_times_and_chosen_fpsr>   c                 K   s   |dkr1|d us
J |d }|dd  D ]}|| |k r n|}qt d|| }	|	|	|k  }	|	S |dkr{|d urn|d | }
|
|k rPt jd||dt jd}	|	S t jd|d| d}	t j|	|ggdd	}	t|	|kslJ |	S t jd||dt jd}	|	S t|)
Nr1   r   r)   uniform_last_frameT)numr   r           stopstepaxis)r   r   r*   float64concatenaterP   rD   )rE   r>   r   r   r   r   rF   r   candidate_fpstimesr2   r   r   r    sample_times  s:   
zMolmo2VideoBackend.sample_timesr@   rA   c                 K   s  | d}| d}|d u rttd|jS |dvr!td| |j}|j}|j}|j}	|j}
|dkr|d ur|dkrHt	
|t}| S ||	d | kret	jd|d t|	|d	d
t}| S t	j
d|d t|| d}t	|d |d krt	j||d ggdd}t	|t}|d |k sJ t||	ksJ | S |dkrt	jd|d t|	|d	d
t}| S |dkr| ||
}| ||	|||}| |||	|\}}| S )Nr   r   r   >   r1   r   zUnsupported frame_sample_mode: r   r[   r)   T)r   r   r   r   r(   r   r1   )rl   rS   rz   r<   rD   r>   r=   r   r1   r   r   astyper$   r*   r   r9   r   r   rP   r   r   r   r   )rE   r@   rA   rF   r   r   r>   r   rM   r   r   indicesfloat_indicesr   r   r   r   r   r    rG     s   

,)
z1Molmo2VideoBackend.compute_frames_index_to_sampler(   r[   FrI   r   r   c                 K   sf   |  |}t|}	t|||	jd}
| j|	|
||d}| j|||	j|d\}}| j|	d|d}||fS )Nr   )r@   rA   r   r   r   r   r   )	rj   rZ   ro   r0   r>   rG   r   r<   rQ   )rE   rI   r   r   r   r   r   rF   ri   r@   rA   r   r   rK   r   r   r   r    load_bytes_opencv  s2   


z$Molmo2VideoBackend.load_bytes_opencvc                 K   sX   t td B |dd }t t|dd}t t|dd}| j|||||fi |}|S )Nr   r   r[   r   )r   rY   r   r$   r   )rE   rI   r   rF   r   r   r   outr   r   r    rJ   0  s   zMolmo2VideoBackend.load_bytes)r   r#   )Nr(   r[   r[   F)r(   )r4   r5   r6   rR   r9   rS   r   r$   rY   r   rU   rV   rW   r   r   r;   r0   rG   rT   r   rX   r   r   rJ   r   r   r   r    r   '  s    0'
)F	)r   nemotron_vlc                   @   sN   e Zd Ze				ddededededed	eej	e
eef f fd
dZdS )NemotronVLVideoBackendr(   r   FrI   r   r1   r2   r   r   c           	      K   s8   t j|f||||d|\}}t|}||d< ||fS )N)r   r1   r2   r   original_video_bytes)r   rJ   rX   )	rE   rI   r   r1   r2   r   rF   r   r   r   r   r    rJ   G  s   

	z!NemotronVLVideoBackend.load_bytesNr   )r4   r5   r6   rR   rT   r$   r   rU   rV   rW   rX   rY   r   rJ   r   r   r   r    r   E  s&    r   	openpanguc                   @   r   )"OpenCVDynamicOpenPanguVideoBackendr@   rA   r   c           	         s   |j |j |j}|j}dkr dkrd   }nd}|dkr=|t|| d kr<t|| d }t||d | }n|dkrHtd| tjd||t	d} fdd|D }|S )Nr)   r   r(   z5requires dataset fps is -1 or greater than 0 but got r   c                    s"   g | ]}t d  t|  qS )r)   )r   r   r   r=   r<   r   r    
<listcomp>  s    zUOpenCVDynamicOpenPanguVideoBackend.compute_frames_index_to_sample.<locals>.<listcomp>)
r<   r=   r   r1   r$   r   rh   r   r*   r9   )	rE   r@   rA   rF   r   r1   total_durationsample_frame_timestampsrN   r   r   r    rG   b  s.   zAOpenCVDynamicOpenPanguVideoBackend.compute_frames_index_to_sampler(   r[   r   FrI   r   r1   r2   r   c                 K   r   )r   r   r   r   r   r   r   )rE   rI   r   r1   r2   r   rF   ri   r@   rA   r   r   rK   r   r   r   r    rJ     s.   


z-OpenCVDynamicOpenPanguVideoBackend.load_bytesNr   r   r   r   r   r    r   `  s6    )r   )-r   abcr   ior   typingr   r   r   numpyr   numpy.typingrV   vllm.loggerr   vllm.utils.import_utilsr   vllm.utils.registryr	   r
   cv2.videoio_registryr   r\   ImportErrorplaceholder_attrr4   r|   rW   rU   r$   r!   r9   r'   r.   r0   r;   r?   VIDEO_LOADER_REGISTRYrZ   registerr   r   r   r   r   r   r   r   r    <module>   sL   "
& uVi  