
    vjo3                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
Zd dlZd dlZd dlmc mZ d dlmc mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lm Z  ddl!m"Z" ddl#m$Z$ 	 	 ddZ%d Z&d Z'	 ddZ(ddZ) G d de*          Z+ e$j,        ej-        ej.                   G d de"                      Z/dS )    N)exists)TemporaryDirectory)urlparse)VideoReader)Compose)http_get_file)Preprocessors)FieldsModeKeys)type_assert   )Preprocessor)PREPROCESSORSc                    t          |          }|j        dv r&t          |j                  rt	          | ||          }nt                      5 }t          j                    j        }t          |||d           t          j                            ||          }t	          | ||          }ddd           n# 1 swxY w Y   ||}	t          | |          }
n&| j        j        }	t          | | j        j                  }
g }t          |                    d                    D ]X}t          |	          D ]F}|
j        d                             |           |                     |
||                              GYt)          j        |d          S )aw   simple interface to load video frames from file

    Args:
        cfg (Config): The global config object.
        video_path (str): video file path
        num_spatial_crops_override (int): the spatial crops per clip
        num_temporal_views_override (int): the temporal clips per video
    Returns:
        data (Tensor): the normalized video clips for model inputs
    )file N)url	local_dir	file_namecookiesr   r   dim)r   schemer   path_decode_videor   uuiduuid4hexr   osjoinkinetics400_tranformTESTNUM_SPATIAL_CROPSrangesize
transformsset_spatial_indexappendtorchstack)cfg
video_pathnum_spatial_crops_overridenum_temporal_views_override
url_parseddatatemporary_cache_dir
random_strtemp_file_pathnum_spatial_crops	transform	data_listijs                 n/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/modelscope/preprocessors/video.pyReadVideoDatar:      s    *%%JL((VO. .(S*.IJJ!! 		>%8)J-$	   
  W\\*=zJJN n!<> >D		> 		> 		> 		> 		> 		> 		> 		> 		> 		> 		> 		> 		> 		> 		> "-6(.HII		H6(ch.HII	I499Q<<   1 1()) 	1 	1A #55a888YYtAw//0000	1 ;ya((((s   AB66B:=B:c                    t          | j        j        | j        j        g| j        j        |          }t	          j                    |t	          j        | j        j        | j        j        d          g}t          |          S )aI  
    Configs the transform for the kinetics-400 dataset.
    We apply controlled spatial cropping and normalization.
    Args:
        cfg (Config): The global config object.
        num_spatial_crops (int): the spatial crops per clip
    Returns:
        transform_function (Compose): the transform function for input clips
    )short_side_range	crop_sizer4   T)meanstdinplace)
KineticsResizedCropDATA
TEST_SCALETEST_CROP_SIZEr&   ToTensorVideoNormalizeVideoMEANSTDr   )r+   r4   resize_videostd_transform_lists       r9   r!   r!   E   s     '(-sx/BC()+- - -L
 	 ""L!CHL$	@ 	@ 	@
 %&&&    c                 |   |dk    rt          j        d| dz
            g}n||z  |z  |z  }	t          | |	z
  d          }
|dk    r|
dz  }n|t          j        |
|dz
  z            z  }|r	||	z   |z
  }n||	z   dz
  }t          j        |||          }t          j        |d| dz
                                            }|S )a  
        Generates the frame index list using interval based sampling.

        Args:
            vid_length (int): the length of the whole video (valid selection range).
            vid_fps (int): the original video fps
            target_fps (int): the normalized video fps
            clip_idx (int):
                -1 for random temporal sampling, and positive values for sampling specific
                clip from the video
            num_clips (int):
                the total clips to be sampled from each video. combined with clip_idx,
                the sampled video is the "clip_idx-th" video from "num_clips" videos.
            num_frames (int): number of frames in each sampled clips.
            interval (int): the interval to sample each frame.
            minus_interval (bool): control the end index

        Returns:
            index (tensor): the sampled frame indexes
    r   r      )	randomrandintmaxmathfloorr)   linspaceclamplong)
vid_lengthvid_fps
target_fpsclip_idx	num_clips
num_framesintervalminus_intervalindexclip_lengthmax_idx	start_idxend_idxs                r9   _interval_based_samplingrc   [   s    , Q:>223 !8+g5
Bj;.22>>!II 4:gQ.G#H#HHI 	2+-8GG+-1Gy':>>E1j1n55::<<LrK   c                    t          t                    sJ ||}n| j        j        }g }t	          |          D ]}t          t                    || j        j        ||| j        j	        | j        j
        | j        j                  }d}t          j        t          j        fd|                                D             d                    }|                    |           t          j        |          }~	|S )a  
        Decodes the video given the numpy frames.
        Args:
            cfg          (Config): The global config object.
            frames_list  (list):  all frames for a video, the frames should be numpy array.
            vid_fps      (int):  the fps of this video.
            num_temporal_views_override (int): the temporal clips per video
        Returns:
            frames            (Tensor): video tensor data
    Nc                      g | ]
}|         S  rf   ).0r^   frames_lists     r9   
<listcomp>z-_decode_video_frames_list.<locals>.<listcomp>   s    EEEUk%(EEErK   r   )axis)
isinstancelistr"   NUM_ENSEMBLE_VIEWSr$   rc   lenrB   
TARGET_FPSNUM_INPUT_FRAMESSAMPLING_RATEMINUS_INTERVALr)   
from_numpynpr*   tolistr(   )
r+   rh   rW   r.   num_clips_per_video
frame_listrY   list_framesvrs
    `        r9   _decode_video_frames_listr{      s    k4(((((".9!h9J-.. " " )HH%H"H#	
 	
 !HEEEEellnnEEEANNNP P&!!!![$$F
MrK   c                 f   t          |          }||}n| j        j        }g }t          |          D ]h}t	          t          |          |                                | j        j        ||| j        j	        | j        j
        | j        j                  }d}|                    d          rt          j        d|d         d          }	t          j        |                    t          j        |	|g                                                                                              }||	j        d         d         }nKt          j        |                    |                                                                                    }|                    |           jt          j        |          }~|S )aK  
        Decodes the video given the numpy frames.
        Args:
            cfg          (Config): The global config object.
            path          (str): video file path.
            num_temporal_views_override (int): the temporal clips per video
        Returns:
            frames            (Tensor): video tensor data
    Nz.avir      )r   r"   rm   r$   rc   rn   get_avg_fpsrB   ro   rp   rq   rr   endswithr)   arangedlpackfrom_dlpack	get_batchcat	to_dlpackcloneshaper(   r*   )
r+   r   r.   rz   rv   rw   rY   rx   ry   append_lists
             r9   r   r      s    
T		B".9!h9J-.. " " )GGNNHH%H"H#	
 	
 ==   	9,q%(A66K'UY(-(/ 0 0 1 11:> >>Cegg  K-a0112FF'U##--//1 116 &!!!![$$F
MrK   c                   4    e Zd ZdZ	 d	dZd Zd Zd Zd ZdS )
rA   a@  Perform resize and crop for kinetics-400 dataset
    Args:
        short_side_range (list): The length of short side range. In inference, this should be [256, 256]
        crop_size         (int): The cropped size for frames.
        num_spatial_crops (int): The number of the cropped spatial regions in each video.
    r   c                 X    d| _         || _        t          |          | _        || _        d S )N)idxr<   intr=   r4   )selfr<   r=   r4   s       r9   __init__zKineticsResizedCrop.__init__   s.      0Y!2rK   c                 "   |j         \  }}}}| j        d         }||k     rNt          |          }t          ||z  |z            }t          j        j                            |||fd          }nMt          |          }t          ||z  |z            }t          j        j                            |||fd          }t          || j        z
            }	t          || j        z
            }
| j        dk    r|	dz  }|
dz  }nn| j        dk    rc| j	        dk    r||k    r|	dz  }d}nJ||k    rd}|
dz  }n<| j	        dk    r|	dz  }|
dz  }n&| j	        dk    r||k    r|	dz  }|
}n||k    r|	}|
dz  }|dddd||| j        z   ||| j        z   f         S )zPerform controlled crop for video tensor.
        Args:
            clip (Tensor): the video data, the shape is [T, C, H, W]
        r   bilinearr%   moder   rM      N)
r   r<   r   r)   nn
functionalinterpolater=   r4   r   )r   clip_clip_height
clip_widthlengthnew_clip_heightnew_clip_widthnew_clipx_maxy_maxxys                r9   _get_controlled_cropz(KineticsResizedCrop._get_controlled_crop   s   
 )-
%1k:&q)##!&kkO k!9O!KLLNx*66O^<: 7 O OHH ![[N!+
":^"KLLOx*66O^<: 7 O OHNT^344Odn455!Q&&
A
AA#q((x1}}!V++
AAA$..A
AQQJQJQ!V++
AAA$..A
A111aDN 22Aa$.6H4HHIIrK   c                 `   |j         \  }}}}t          ||          }t          ||          }t          t	          j        | j                   }t          ||z  |z            }||k     r|}	|}
n|}	|}
t          j        j	        
                    ||	|
fd          }t          |
| j        z
            }t          |	| j        z
            }t          t	          j        d|                    }t          t	          j        d|                    }|d d d d ||| j        z   ||| j        z   f         S )Nr   r   r   )r   minrP   r   rN   uniformr<   r)   r   r   r   r=   )r   r   r   r   r   
short_side	long_sidenew_short_sidenew_long_sider   r   r   r   r   r   r   s                   r9   _get_random_cropz$KineticsResizedCrop._get_random_crop  s<   (,
%1k:j11
Z00	V^T-BCDDI
2^CDD##,O*NN+O+N8&228z 3 K K NT^344Odn455q%(())q%(())111aDN 22Aa$.6H4HHIIrK   c                     || _         dS )zSet the spatial cropping index for controlled cropping..
        Args:
            idx (int): the spatial index. The value should be in [0, 1, 2], means [left, center, right], respectively.
        N)r   )r   r   s     r9   r'   z%KineticsResizedCrop.set_spatial_index6  s    
 rK   c                 ,    |                      |          S N)r   )r   r   s     r9   __call__zKineticsResizedCrop.__call__=  s    ((...rK   N)r   )	__name__
__module____qualname____doc__r   r   r   r'   r   rf   rK   r9   rA   rA      sz          		3 	3 	3 	3*J *J *JXJ J J.  / / / / /rK   rA   )module_namec                   T     e Zd Z fdZd Zd Z eee          d             Z xZ	S )"MovieSceneSegmentationPreprocessorc                     t                      j        |i | |                    dd          | _        |                    t          j        d          | _        |                    t          j        d          | _        |                    dd          | _	        ddl
m}  || j                  | _         || j                  | _        dS )z7
        movie scene segmentation preprocessor
        is_trainTNnum_keyframer   r   )get_transform)superr   popr   r   TRAINpreprocessor_train_cfgEVALpreprocessor_test_cfgr   movie_scene_segmentationr   train_transformtest_transform)r   argskwargsr   	__class__s       r9   r   z+MovieSceneSegmentationPreprocessor.__init__E  s     	$)&)))

:t44&,jj&F&F#%+ZZt%D%D""JJ~q99;;;;;;,}T-HII+mD,FGGrK   c                     d| _         d S )NTr   r   s    r9   trainz(MovieSceneSegmentationPreprocessor.trainT  s    rK   c                     d| _         d S )NFr   r   s    r9   evalz'MovieSceneSegmentationPreprocessor.evalX  s    rK   c                     | j         r| j        }n| j        }t          j         ||          d          }|                    d| j        ddd          }|S )Nr   r   r   r      )r   r   r   r)   r*   viewr   )r   resultsr&   s      r9   r   z+MovieSceneSegmentationPreprocessor.__call__\  s]    = 	--JJ,J+jj11q999,,r4#4acBBrK   )
r   r   r   r   r   r   r   objectr   __classcell__)r   s   @r9   r   r   A  s        H H H H H     [    !     rK   r   )NNr   )0rQ   r   rN   r   os.pathr   tempfiler   urllib.parser   numpyrt   r)   torch.utils.datatorch.utils.dlpackutilsr   (torchvision.transforms._transforms_videor&   _transforms_videodecordr   torchvision.transformsr   modelscope.hub.file_downloadr   modelscope.metainfor	   modelscope.utils.constantr
   r   modelscope.utils.type_assertr   baser   builderr   r:   r!   rc   r{   r   r   rA   register_modulecv%movie_scene_segmentation_preprocessorr   rf   rK   r9   <module>r      sP    				         ' ' ' ' ' ' ! ! ! ! ! !          # # # # # # # # # = = = = = = = = =       * * * * * * 6 6 6 6 6 6 - - - - - - 6 6 6 6 6 6 6 6 4 4 4 4 4 4       " " " " " "
 .2.2)) )) )) ))X' ' ',) ) )^ ;?( ( ( (V+ + + +\^/ ^/ ^/ ^/ ^/& ^/ ^/ ^/B 
I=NP P P" " " " " " "P P" " "rK   