
    vj                        d dl Z d dlZd dlmZmZmZmZmZmZm	Z	m
Z
mZ d dlZd dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlm Z m!Z! d d	l"m#Z# d d
l$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@ d dlAmBZB  eB            ZCdefdZD G d d          ZEdS )    N)	AnyCallableDictIterableListMappingOptionalSequenceUnion)DatasetDatasetDictFeaturesIterableDatasetIterableDatasetDict)_PACKAGED_DATASETS_MODULES)DatasetRepository)DatasetContextConfig)LocalDataLoaderManagerLocalDataLoaderTypeRemoteDataLoaderManagerRemoteDataLoaderType)ExternalDatasetNativeIterableDataset)build_custom_dataset)DatasetDeleteManager)load_dataset_with_ctx)DatasetUploadManager)build_preprocessor)Config
ConfigDict)MS_DATASETS_CACHE)
DEFAULT_DATASET_NAMESPACEDEFAULT_DATASET_REVISIONREPO_TYPE_DATASETConfigFieldsDatasetFormationsDownloadModeHubsModeKeysTasks
UploadMode)is_relative_path)is_tf_availableis_torch_available)
get_loggerreturnc                     | g } nXt          | t                    r| g} n?t          t          |                     t          |           k     rt	          d|            | S )Nz"List columns contains duplicates: )
isinstancestrlenset
ValueError)paras    p/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/modelscope/msdatasets/ms_dataset.pyformat_listr9   )   sc    |	D#		 Fv	SYY#d))	#	#DdDDEEEK    c            +          e Zd ZU dZdZdZeed<   	 dJdee	e
eef         dee         fdZd Zd Zd	 Zed
             Zed             Ze	 dJdee	eef         dedeed f         fd            Ze	 dJdee	eeee
ef         dedeed f         fd            Zeedeej         dddde!j"        e#ddd e$            dddfdeee%f         dee         dee         dee         dee         dee         dee         dee         deeee&e         e'eeee&e         f         f         f                  dee!         dee         dee(         dee)         dee*         dee$         d ee         d!ee)         d"ee)         deed ef         f&d#            Z+eeeddd$e,j-        fd%ed&ededee         dee         d'ee*         d(ee*         d)ee)         d*ee,         ddfd+            Z.eeddfd,ed-ed.ee         d/ee         d0ee         ddfd1            Z/eedddfd,ed2ed.ee         d/ee         d0ee         d3e)ddfd4            Z0eeefd%ededee         dee         def
d5            Z1	 	 	 	 	 dKd6eee2e         f         d7ee3e2e3         f         d8ed9e4d:e)f
d;Z5	 	 	 	 	 	 	 dLd<e*d=e)d7ee3e2e3         f         d6eee2e         f         d>e3d?e)d@e6ee7f         dAeee2e         f         dBe)fdCZ8de	fdDZ9dEe6eef         de	fdFZ:	 	 dMd7ee3e2e3         f         d6eee2e         f         d:e)fdGZ;	 	 	 	 dNd<e*d=e)d7ee3e2e3         f         d?e)dBe)dAeee2e         f         d6eee2e         f         fdHZ<	 	 dOde$fdIZ=dS )P	MsDataseta  
    ModelScope Dataset (aka, MsDataset) is backed by a huggingface Dataset to
    provide efficient data access and local storage managements. On top of
    that, MsDataset supports the data integration and interactions with multiple
    remote hubs, particularly, ModelScope's own Dataset-hub. MsDataset also
    abstracts away data-access details with other remote storage, including both
    general external web-hosted data and cloud storage such as OSS.
    N_dataset_context_configds_instancetargetc                     || _         |L|| j         j        vr>t          dt          | j         j                                                   d|           || _        d| _        d S )Nz)"target" must be a column of the dataset(z
, but got F)_hf_dsfeatures	TypeErrorlistkeysr?   	is_custom)selfr>   r?   s      r8   __init__zMsDataset.__init__@   ss     "&0D"D"DqDAUAZAZA\A\<]<]qqioqq   r:   c              #   T   K   | j         D ]}| j        || j                 V  |V  d S N)rA   r?   )rG   items     r8   __iter__zMsDataset.__iter__L   sJ      K 	 	D{&4;'''''



		 	r:   c                     | j         |         S rJ   rA   )rG   keys     r8   __getitem__zMsDataset.__getitem__S   s    {3r:   c                 *    t          | j                  S rJ   )r4   rA   rG   s    r8   __len__zMsDataset.__len__V   s    4;r:   c                     | j         S rJ   rN   rR   s    r8   r>   zMsDataset.ds_instanceY   s
    {r:   c                 R    t          | j        t                    r| j        j        S d S rJ   )r2   rA   r   config_kwargsrR   s    r8   rV   zMsDataset.config_kwargs]   s&    dk?33 	;,,4r:   hf_dsr0   c                      t          j        dt                     t          |t                    r  |          S t          |t
                    r~t          |                                          dk    r8  t          t          |
                                                              S  fd|                                D             S t          |t                    r  |          S t          dt          |                     )z
        @deprecated
        This method is deprecated and may be removed in future releases, please use `to_ms_dataset()` instead.
        z@from_hf_dataset is deprecated, please use to_ms_dataset instead.   c                 0    i | ]\  }}| |          S  r[   .0kvclsr?   s      r8   
<dictcomp>z-MsDataset.from_hf_dataset.<locals>.<dictcomp>t   s)    @@@$!QAss1f~~@@@r:   z2"hf_ds" must be a Dataset or DatasetDict, but got )warningswarnDeprecationWarningr2   r   r   r4   rE   nextitervaluesitemsr   rC   type)r`   rW   r?   s   ` `r8   from_hf_datasetzMsDataset.from_hf_datasetd   s    	N	  	  	  eW%% 	3uf%%%{++ 		5::<<  A%%s4U\\^^ 4 455v>>>@@@@@%++--@@@@// 	3u::RT%[[RR  r:   c                     t          |t                    r  |          S t          |t                    r~t          |                                          dk    r8  t          t          |                                                              S  fd|                                D             S t          |t                    r  |          S t          |t                    r  |          S t          |t                    r  |          S t          |t                    r~t          |                                          dk    r8  t          t          |                                                              S  fd|                                D             S t          dt          |                     )z&Convert input to `MsDataset` instance.rY   c                 0    i | ]\  }}| |          S r[   r[   r\   s      r8   ra   z+MsDataset.to_ms_dataset.<locals>.<dictcomp>   )    FFF$!QAss1f~~FFFr:   c                 0    i | ]\  }}| |          S r[   r[   r\   s      r8   ra   z+MsDataset.to_ms_dataset.<locals>.<dictcomp>   rm   r:   z8"ds_instance" must be a Dataset or DatasetDict, but got )r2   r   r   r4   rE   re   rf   rg   rh   r   r   r   r   rC   ri   )r`   r>   r?   s   ` `r8   to_ms_datasetzMsDataset.to_ms_dataset|   s    k7++ 	3{F+++[11 	;##%%&&!++s4[%7%7%9%9 : :;;VDDDFFFFF+2C2C2E2EFFFF_55 	3{###%:;; 	3{###_55 		3{###%899 	;##%%&&!++s4[%7%7%9%9 : :;;VDDDFFFFF+2C2C2E2EFFFF^4P[K\K\^^  r:   FrY   dataset_name	namespaceversionhubsubset_namesplitdata_dir
data_filesdownload_mode	cache_dirrB   use_streamingstream_batch_size
custom_cfgtokendataset_info_onlytrust_remote_codec                 
   |r%ddl m}  |            }|                    |           t          |	pt          j                  }	t          |pt
          j                  }|t
          j        k    }t          | t                    s4t          | t                    st          dt          |                      t          | t                    r6|d}t          j        || i          }t                              ||          S t"          j                            |           } t"          j                            |           }t+          |           rl|                     d          dk    rS|sQ|sO|                     d          }|d                                         }|d                                         } |r| sd	|rt2                              d
|  d           |                    d          dk    rIddl}ddl}	 |                    |j                   n%# t@          $ r |                    d           Y nw xY wtC          d| |||||||||	|
|||d|}| tD          v s>t"          j        #                    |           st"          j        $                    |           r~tK          |          &                    tN          j(                  }t                              ||          }t          |t                    r||_)        |r |j*        dd|i| d|_+        |S |t
          j        k    rddl,m&}  || f|||||
||	j-        |||d
|S |t
          j        k    rCddl m}  |            }|.                    |dz   | z   t^                    }|0                    | ||          \  }} t          |           t          tb          j2        j-                  k    rBtg          d|dz   | z   |||||
|d|	j-        |||||d|5 }!|!cddd           S # 1 swxY w Y   dS ti          |          }"|"&                    tj          j6                  }t                              ||          }t          |t                    r$|"j7        |_)        |r |j*        dd|i| d|_+        |S |t
          j8        k    rts          j:        dtv                     ddl<m=}# ddl>m?}$ |t          k    r|$jA        |_B        |t          k    r|$jD        |_E        |
t          k    r/ddlGmH}% t"          j        I                    |%ddd          }
|
|_J         |#|          }&|&K                                 |&jL        S d) a'
  Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.

            Args:
                dataset_name (str): Path or name of the dataset.
                    The form of `namespace/dataset_name` is also supported.
                namespace(str, optional): Namespace of the dataset. It should not be None if you load a remote dataset
                    from Hubs.modelscope,
                namespace (str, optional):
                    Namespace of the dataset. It should not be None if you load a remote dataset
                    from Hubs.modelscope,
                target (str, optional): Name of the column to output.
                version (str, optional): Version of the dataset script to load:
                subset_name (str, optional): Defining the subset_name of the dataset.
                data_dir (str, optional): Defining the data_dir of the dataset configuration. I
                data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
                split (str, optional): Which split of the data to load.
                hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
                download_mode (DownloadMode or str, optional): How to treat existing datasets. default
                                                               DownloadMode.REUSE_DATASET_IF_EXISTS
                cache_dir (str, Optional): User-define local cache directory.
                use_streaming (bool, Optional): If set to True, no need to download all data files.
                                                Instead, it streams the data progressively, and returns
                                                NativeIterableDataset or a dict of NativeIterableDataset.
                stream_batch_size (int, Optional): The batch size of the streaming data.
                custom_cfg (str, Optional): Model configuration, this can be used for custom datasets.
                                           see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
                token (str, Optional): SDK token of ModelScope.
                dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
                trust_remote_code (bool, Optional): If set to True, trust the remote code. Default to `False`.
                **config_kwargs (additional keyword arguments): Keyword arguments to be passed

            Returns:
                MsDataset (MsDataset): MsDataset object for a certain dataset.
            r   )HubApiz.dataset_name must be `str` or `list`, but got Nr?   )r?   /rY   zUThe dataset_name should be in the form of `namespace/dataset_name` or `dataset_name`.z3Use trust_remote_code=True. Will invoke codes from z9. Please make sure that you can trust the external codes.enginepythoni)rp   rq   rr   rt   ru   r?   rs   rv   rw   rx   cache_root_dirrz   r{   r   r|   T)load_dataset)
namerv   rw   ru   ry   rB   rx   revisionr}   	streaming)repo_id	repo_type)rp   rq   endpoint)pathr   rv   rw   ru   ry   rB   download_configrx   r   r}   r   r~   r   zMThe option `Hubs.virgo` is deprecated, will be removed in the future version.)VirgoDownloader)VirgoDatasetConfig)
CACHE_HOMEvirgors   datasetszPlease adjust input args to specify a loading mode, we support following scenes: loading from local disk, huggingface hub and modelscope hub.r[   )Mmodelscope.hub.apir   loginr'   REUSE_DATASET_IF_EXISTSr(   
modelscopehuggingfacer2   r3   rD   rC   ri   r   	from_dictr<   ro   osr   
expanduserexistsr,   countru   striploggerwarninggetcsvsysfield_size_limitmaxsizeOverflowErrorr   r   isdirisfiler   r   r   HF_DATA_LOADERr=   to_custom_datasetrF   r   valueget_endpoint_for_readr$   get_dataset_id_and_typer&   generalr   r   r   MS_DATA_LOADERdataset_context_configr   rb   rc   rd   -modelscope.msdatasets.data_loader.data_loaderr   modelscope.utils.constantr   r"   default_virgo_namespacerq   r#   default_dataset_versionrr   r!   modelscope.utils.config_dsr   joinr   processdataset)'rp   rq   r?   rr   rs   rt   ru   rv   rw   rx   ry   rB   rz   r{   r|   r}   r~   r   rV   r   apiis_huggingface_hubdataset_instis_local_pathdataset_name_split
csv_moduler   r   r   _apir   dataset_id_on_hubdataset_typedataset_resremote_dataloader_managerr   r   r   virgo_downloaders'                                          r8   loadzMsDataset.load   sI   x  	111111&((CIIe$] &M(4(LN N3)$/**!T%55,,, 	Zd6$ 6$ 	UlASASUU   lD)) 	H~!",fl-CDDL**<*GGGw)),77|44L)) 	nl.@.@/ // /"//8J/!-!3!3C!8!8*1-3355I-a06688L nL nmm 	5NN4l 4 4 45 5 5
 X&&(22$$$$JJJ8++CK8888  8 8 8++J777778 "6 "%#!'$'//" " " "$ 555: :5!#!=!= 61&( ((4'6)8 )8  %22<2OOL,	22 27M4 22L2 @ @#-@1>@ @ @-1L*D$$$------<! !%#!+1 '! !  ! ! ! DO##111111688D11!C,6+ 2 - -H /3.J.J)#! /K /# /#+| <  C(9(A(G$H$HHH* )&_|;(!)#-#"+!)(,&3&9!(#"/*;*;) ) () ) ' -8&#' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '* -D*-, -,)8EE(7 9  9(66   7  1  1lI66 6;T;kL8! 666 D D'1D5BD D D15.##DJM9:LN N N VUUUUUDDDDDD5553E3]&02221C1[&.---AAAAAAGLLWe)35 5	8A&5./EFF$$&&&#++Q Qs$   =H H:9H:?PPPTobject_namelocal_file_pathnum_processes	chunksizefilter_hidden_filesupload_modec	                    	 t          j        dt                     | st          d          t	          |||          }	t          |pt
          j                  }t          j        	                    |          r|	
                    | ||           dS t          j                            |          r|	                    | |||||           dS t          | d          )z
        @deprecated
        This method is deprecated and may be removed in future releases, please use git command line instead.
        zThe function `upload` is deprecated, please use git command or modelscope.hub.api.HubApi.upload_folder or modelscope.hub.api.HubApi.upload_file.zobject_name cannot be empty!rp   rq   rr   )r   r   r   )object_dir_namelocal_dir_pathr   r   r   r   z& is not a valid file path or directoryN)rb   rc   rd   r6   r   r+   	OVERWRITEr   r   r   uploadr   
upload_dir)
r   r   rp   rq   rr   r   r   r   r   _upload_managers
             r8   r   zMsDataset.upload  s5   	. 	8 :L		M 	M 	M  	=;<<<.%GM M M !!D
0DEE7>>/** 	L""' /' # ) ) ) ) ) W]]?++ 
	L&& +.+#$7' ' ) ) ) ) ) "JJJL L Lr:   dataset_work_dir
dataset_idr   
auth_tokengit_pathc                 F   t          j        dt                     t          | ||||          }|                                }|r/t
                              d                    |                     dS t
                              d                    |                     dS )a  Clone meta-file of dataset from the ModelScope Hub.

        Args:
            dataset_work_dir (str): Current git working directory.
            dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name .
            revision (str, optional):
                revision of the model you want to clone from. Can be any of a branch, tag or commit hash
            auth_token (str, optional):
                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
                as the token is already saved when you login the first time, if None, we will use saved token.
            git_path (str, optional):
                The git command line path, if None, we use 'git'
        Returns:
            None
        zWThe function `clone_meta` is deprecated, please use git command line to clone the repo.repo_work_dirr   r   r   r   zAlready cloned repo to: {}zRepo dir already exists: {}N)	rb   rc   rd   r   cloner   infoformatr   )r   r   r   r   r   _repoclone_work_dirs          r8   
clone_metazMsDataset.clone_meta  s    , 	e	  	  	  "*!!    	FKK4;;NKKLLLLLNN-44^DDF F F F Fr:   commit_messageforcec                     t          j        dt                     t          | d|||          }|                    |||           dS )aU  Upload meta-file of dataset to the ModelScope Hub. Please clone the meta-data from the ModelScope Hub first.

        Args:
            dataset_work_dir (str): Current working directory.
            commit_message (str): Commit message.
            revision(`Optional[str]`):
                revision of the model you want to clone from. Can be any of a branch, tag or commit hash
            auth_token(`Optional[str]`):
                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
                as the token is already saved when you log in the first time, if None, we will use saved token.
            git_path:(`Optional[str]`):
                The git command line path, if None, we use 'git'
            force (Optional[bool]): whether to use forced-push.

        Returns:
            None

        zuThe function `upload_meta` is deprecated, please use git command or CLI `modelscope upload owner_name/repo_name ...`. r   )r   branchr   N)rb   rc   rd   r   push)r   r   r   r   r   r   r   s          r8   upload_metazMsDataset.upload_meta  sh    2 	C 		  	  	  "*!   	

.
OOOOOr:   c                     t          |||          }|                    |           }t                              d|  d           |S )as   Delete object of dataset. Please log in first and make sure you have permission to manage the dataset.

        Args:
            object_name (str): The object name of dataset to be deleted. Could be a name of file or directory. If it's
                directory, then ends with `/`.
                For example: your-data-name.zip, train/001/img_001.png, train/, ...
            dataset_name (str): Path or name of the dataset.
            namespace(str, optional): Namespace of the dataset.
            version (str, optional): Version of the dataset.

        Returns:
            res_msg (str): Response message.

        r   )r   zObject z successfully removed!)r   deleter   r   )r   rp   rq   rr   _delete_managerresp_msgs         r8   r   zMsDataset.delete  s[    & /%GM M M"))k)BBAkAAABBBr:   columnspreprocessors	task_namedata_config	to_tensorc                    t                      st          d          t          | j        t                    rF|                    d|i           |                    | j        j                   t          ||          S ||                     |||          S | j        	                                 | j        
                    d||           | j        S )aF  Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
           torch.utils.data.DataLoader.

        Args:
            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
                every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict
                will be used as a field of torch.utils.data.Dataset.
            columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if
                `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column.
                If the `preprocessors` is not None, the output fields of processors will also be added.
            task_name (str, default None):  task name, refer to :obj:`Tasks` for more details
            data_config (ConfigDict, default None): config dict for model object.
                Attributes of ConfigDict:
                    `preprocessor` (Callable, List[Callable], optional): preprocessors to deal with dataset
                    `type` (str): the type of task
                    `split_config` (dict, optional): get the split config for ExternalDataset
                    `test_mode` (bool, optional): is test mode or not
            to_tensor (bool, default None): whether convert the data types of dataset column(s) to torch.tensor or not.
            format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

        Returns:
            :class:`torch.utils.data.Dataset`

        z>The function to_torch_dataset requires pytorch to be installedpreprocessorN)r   r   torch)ri   r   format_kwargs)r.   ImportErrorr2   rA   r   updaterV   r   !_to_torch_dataset_with_processorsreset_format
set_format)rG   r   r   r   r   r   r   s          r8   to_torch_datasetzMsDataset.to_torch_dataset+  s    B "## 	P   dk?33 	@>???t{8999'Y???$99w) : E E E K$$&&&K""g] # L L L;r:   
batch_sizeshuffle
collate_fndrop_remaindercollate_fn_args
label_colsprefetchc
           
      *   t                      st          d          ||                     |||||	||          S |t                              d           dS | j                                         | j                            ||||||||	          S )a  Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like
           model.fit() or model.predict().

        Args:
            batch_size (int): Number of samples in a single batch.
            shuffle(bool): Shuffle the dataset order.
            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
                every sample of the dataset. The output type of processors is dict, and each field of the dict will be
                used as a field of the tf.data. Dataset. If the `preprocessors` is None, the `collate_fn`
                shouldn't be None.
            columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None,
                the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of
                processors will also be added.
            collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If
                the `preprocessors` is None, the `collate_fn` shouldn't be None.
            drop_remainder(bool, default None): Drop the last incomplete batch when loading.
            collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the`collate_fn`.
            label_cols (str or List[str], default None): Dataset column(s) to load as labels.
            prefetch (bool, default True): Prefetch data.

        Returns:
            :class:`tf.data.Dataset`

        z?The function to_tf_dataset requires Tensorflow to be installed.N)r   r  r   r   z?The `preprocessors` and the `collate_fn` should`t be both None.)r   r   r   r  )r-   r   _to_tf_dataset_with_processorsr   errorrA   r   to_tf_dataset)
rG   r   r   r   r   r   r   r   r   r  s
             r8   r  zMsDataset.to_tf_dataset]  s    H    	Q   $66-!% 7 ! ! ! LLQ   4  """{(()+! )   	r:   c                 B    | j                                          | j         S rJ   )rA   r   rR   s    r8   to_hf_datasetzMsDataset.to_hf_dataset  s      """{r:   column_mappingc                 h    | j                                          | j                             |          S )a  
        Rename columns and return the underlying hf dataset directly
        TODO: support native MsDataset column rename.
        Args:
            column_mapping: the mapping of the original and new column names
        Returns:
            underlying hf dataset
        )rA   r   rename_columns)rG   r  s     r8   remap_columnszMsDataset.remap_columns  s.     	  """{)).999r:   c                    t          |t                    r|n|g}t                    fd| j        j                                        D             g }g }|rt          t          | j                            fdD             }|D ]<}|                    d  |          	                                D                        =d }	|                                D ]\}
 |	||
                   s4t                              d|
 d           |                    |
           G|                    |
           ]dd l G fdd	j        j        j                  } || j        ||||          S )
Nc                     g | ]}|v |	S r[   r[   )r]   rO   r   s     r8   
<listcomp>z?MsDataset._to_torch_dataset_with_processors.<locals>.<listcomp>  s%     
 
 
#..C...r:   c                 F    i | ]}|t          j        |                   S r[   nparray)r]   r^   samples     r8   ra   z?MsDataset._to_torch_dataset_with_processors.<locals>.<dictcomp>  s)    BBBQ!RXfQi00BBBr:   c                 >    i | ]\  }}|t          j        |          S r[   r  r]   r^   r_   s      r8   ra   z?MsDataset._to_torch_dataset_with_processors.<locals>.<dictcomp>  s6     < < <A  < < <r:   c                     t          j        | j        t           j                  p#t          j        | j        t           j                  S rJ   )r  
issubdtypedtypeintegerfloating)r   s    r8   is_numpy_numberzDMsDataset._to_torch_dataset_with_processors.<locals>.is_numpy_number  s8    }U["*== .KB. B. .r:   zData of column z  is non-numeric, will be removedr   c                   <     e Zd Zdef fdZd ZfdZd Z xZS )AMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDatasetr   c                     t          t                                                     || _        || _        || _        || _        || _        || _        d S rJ   )	superr<   rH   r   preprocessor_listr   retained_numeric_columnsretained_unumeric_columnsr   )rG   r   r   r!  r"  r   r   	__class__s          r8   rH   zJMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__init__  sP     i  ))+++&):&!*0H-1J.&r:   c                 *    t          | j                  S rJ   )r4   r   rR   s    r8   rS   zIMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__len__  s    4<(((r:   c                 @    | j         r                    |          S |S rJ   )r   	as_tensor)rG   xr   s     r8   type_converterzPMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.type_converter  s$    >  ??1---Hr:   c                      j         |          fd j        D             } j        D ]\} |                                          D ]<\  }} j        r	| j        v r                     |          ||<   .| j        v r|||<   =]|S )Nc                 h    i | ].}j         r	|j        v |                    |                   /S r[   )r   r!  r(  )r]   r^   	item_dictrG   s     r8   ra   zaMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__getitem__.<locals>.<dictcomp>  sP       $. D999 t**9Q<88999r:   )r   r   r   rh   r   r!  r(  r"  )rG   indexresr   r^   r_   r+  s   `     @r8   rP   zMMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__getitem__  s     L/	    !\  
 %)$: ' 'L ,Y 7 7 = = ? ? ' '1 $ ' !T%B B B%)%8%8%;%;CFF$"@@@%&CF' 
r:   )	__name__
__module____qualname__r   rH   rS   r(  rP   __classcell__)r#  r   s   @r8   MsMapDatasetr    s|        	' 	' 	' 	' 	' 	' 	') ) )          r:   r2  )r2   rD   r9   rA   rB   rE   re   rf   r   rh   r   r   appendr   utilsdatar   )rG   r   r   r   r   r!  r"  
sample_res	processorr  r^   r2  r  r   s     `         @@r8   r   z+MsDataset._to_torch_dataset_with_processors  s    .84.! .! 6MM'4o 	 g&&
 
 
 
;/4466
 
 
 $& $&! 	3$t{++,,FBBBB'BBBJ. = =	!!< <!*6!2!2!8!8!:!:< < <= = = =. . .  __&& 3 3&z!}55 NNM!MMMO O O-44Q777(//2222$	 $	 $	 $	 $	 $	 $	5;+3 $	 $	 $	L |DK):45w	K K 	Kr:   c                     t          |t                    r|n|gt                    t          |          }t          t          |z                       fd j        j                                        D             dd lj        j	        
                    t          j        t           j                  t          j                            }|r(|                    t           j                            }d fd	 dd                                                  d j                  g          fd	            }	dd
lm}
 |                    |	|
          }rfd}|                    |          }n)t          |          dk    r|                    d           }|dk    r|                    ||          }|r|                    |
          }|S )Nc                     g | ]}|v |	S r[   r[   )r]   rO   cols_to_retains     r8   r  z<MsDataset._to_tf_dataset_with_processors.<locals>.<listcomp>  s*     
 
 
#:O:OC:O:O:Or:   r   )r  )buffer_sizeFc                 :    t                       fdD             }D ]G}|                    d  |j                                                            D                        H|r|S t	          t          |                                                    S )Nc                 \    i | ](}|t          j        j                 |                   )S r[   )r  r  rA   )r]   r^   irG   s     r8   ra   zJMsDataset._to_tf_dataset_with_processors.<locals>.func.<locals>.<dictcomp>  s0    LLLa1bht{1~a011LLLr:   c                 >    i | ]\  }}|t          j        |          S r[   r  r  s      r8   ra   zJMsDataset._to_tf_dataset_with_processors.<locals>.func.<locals>.<dictcomp>  s6       1 rx{{  r:   )intr   rA   rh   tuplerD   rg   )r>  return_dictr-  r   r   retained_columnsrG   s   `   r8   funcz6MsDataset._to_tf_dataset_with_processors.<locals>.func  s    AALLLLL;KLLLC 1  

   ,T[^ < < B B D D        
cjjll++,,,r:   T)input_signaturec                                          | gfd                                D                       fdt                    D             S )Nc                 N    g | ]!}j                             |j                  "S r[   )dtypesas_dtyper  )r]   valtfs     r8   r  zTMsDataset._to_tf_dataset_with_processors.<locals>.fetch_function.<locals>.<listcomp>-  s;        I&&sy11  r:   )inpToutc                 (    i | ]\  }}||         S r[   r[   )r]   r>  rO   outputs      r8   ra   zTMsDataset._to_tf_dataset_with_processors.<locals>.fetch_function.<locals>.<dictcomp>2  s#    GGGvq#CGGGr:   )numpy_functionrg   	enumerate)r>  rO  rD  r6  rK  s    @r8   fetch_functionz@MsDataset._to_tf_dataset_with_processors.<locals>.fetch_function(  s}    &&C   )0022   '  F HGGG:1F1FGGGGr:   )AUTOTUNE)num_parallel_callsc                 P   fd|                                  D             }t          |           dk    r.t          t          |                                                     } t          |          dk    r.t          t          |                                                    }| |fS )Nc                 $    i | ]\  }}|v 	||S r[   r[   )r]   rO   tensorr   s      r8   ra   z_MsDataset._to_tf_dataset_with_processors.<locals>.split_features_and_labels.<locals>.<dictcomp>:  s3       #VcZ>O>O >O>O>Or:   rY   )rh   r4   re   rf   rg   )input_batchlabelsr   s     r8   split_features_and_labelszKMsDataset._to_tf_dataset_with_processors.<locals>.split_features_and_labels9  s       '2'8'8':':   {##q(("&tK,>,>,@,@'A'A"B"BKv;;!##!$v}}"7"788F"F**r:   rY   c                 ^    t          t          |                                                     S rJ   )re   rf   rg   )r'  s    r8   <lambda>z:MsDataset._to_tf_dataset_with_processors.<locals>.<lambda>G  s    $tAHHJJ7G7G2H2H r:   )r   )F)r2   rD   r9   r5   rA   rB   rE   
tensorflowr5  r   from_tensor_slicesr  aranger4   int64r   function
TensorSpectensorflow.data.experimentalrS  mapbatchr  )rG   r   r   r   r   r  r   r   
tf_datasetrR  rS  rZ  r:  rD  r   rC  r6  rK  s   `     `     @@@@@@r8   r  z(MsDataset._to_tf_dataset_with_processors   sy    .84.! .! 6MM'4o 	 !,,
g&&c*w"67788
 
 
 
;/4466
 
 
 	 W_77Ic$+&&bh7779 9
 	J#++DK8H8H+IIJ	- 	- 	- 	- 	- 	- 	- 	- T!T]]
	bmmD"(&C&C%D	E	E		H 		H 		H 		H 		H 		H 
F	E		H 	:99999^^x $ 9 9
 	J	+ 	+ 	+ 	+ 	+ $(ABBJJ\\Q#(H(HIIJ>>#))> * ; ;J  	7#,,X66Jr:   c                 j   t                      st          d          |sdS d| _        |d|v r|                    d          }|t          j        k    rdnd}|                    d|           }|Dt          |t          j	                  rt          |j	        j                  nt          d          }|                    t          |	                     |j        }d
|v r|                    d
          }t!          j        |          }d|v r|                    d          }|)t          |d          r|j        }	|	rt'          |	|          }t)          | j        t,                    r_|                    t          |                     |                    | j        j                   t1          ||j                  | _        dS |3|                    dd          }
|                     ||
          | _        n4| j                                         | j                            d           dS )a  Convert the input datasets to specific custom datasets by given model configuration and preprocessor.

        Args:
            custom_cfg (Config): The model configuration for custom datasets.
            preprocessor (Preprocessor, Optional): Preprocessor for data samples.
            mode (str, Optional): See modelscope.utils.constant.ModeKeys

        Returns:
            `MsDataset`
        z?The function to_custom_dataset requires pytorch to be installedNTmodetrainrJ  zdataset.)ri   )rh  taskfieldr   )r   )cfgr   r   )r   r   r   )r.   r   rF   r   r)   TRAINsafe_gethasattrr%   modelr    ri   r   dictrj  popr*   find_field_by_taskr   r   r2   rA   r   rV   r   r   r   r   )rG   r|   r   rh  kwargs
ds_cfg_keydata_cfgr   
field_namepreprocessor_cfgr   s              r8   r   zMsDataset.to_custom_datasetP  sb     "## 	Q    	F  <zz&)) !% 6 6WWE
&&'>*'>'>??AHL.B0 B0 Kzz'7'<====5?T5J5J5J $((( O	V

6**I-i88
fG,,JGJ$G$G)6 P12BJOO dk?33 	OODl;;;<<<OODK5666.
9 9 9DKF#

;55I@@*i A A ADKK K$$&&&K"""000r:   rJ   )NNNNT)NNNNNNT)NT)NTNN)NN)>r.  r/  r0  __doc__rA   r=   r   __annotations__r   r   r   r   r   r	   r3   rH   rL   rP   rS   propertyr>   rV   classmethodr   rq  rj   r   ro   staticmethodr"   r#   r(   r   r'   r   r!   r   rD   r
   r   r   boolr@  r   r+   r   r   r   r   r   r   r   r    r   r   r   r  r  r  r   r  r   r[   r:   r8   r<   r<   3   s          F481888
 *.
 
#G_o$9%: ;
 "#
 
 
 
               X   X  '+ $Wk?%JK #/4T;5F/G   [. 
 %)	 #(+)>)8:M*N $O "	 .343D-E	   [6  $= $!9"o%)#"& KO0<#4'+(-+,'-vxx#,1,1+dQ dQCI&dQC=dQ dQ #	dQ
 d^dQ c]dQ }dQ 3-dQ U3#*3c6>sm7D 1E ,E $F$F G HdQ  -dQ C=dQ 8$dQ   ~!dQ" $C=#dQ$ V$%dQ& }'dQ( $D>)dQ* $D>+dQ. 
t["77	8/dQ dQ dQ \dQL 
 (A%=+/'(260:0DBL BLBL BL BL  }	BL
 c]BL $C=BL  }BL "*$BL "*-BL JNBL BL BL \BLH  .F/3-1	$F $FS $F"$F%c]$F  (}$F &c]	$F 7;	$F $F $F \$FL  /G04.2"'$P $Pc $P$'$P&sm$P !)$P 'sm	$P
  $P
 -1$P $P $P \$PL  +D(@ C  "3- ! FI   \4 *.9="&0 0sDI~&0 XtH~560 	0
  0 0 0 0 0l :>)-##*.,0@ @@ @ XtH~56	@
 sDI~&@ @ @ c3h@ #tCy.)@ @ @ @ @Dw    
:DcN 
:w 
: 
: 
: 
: *.	OK OKXtH~56OK sDI~&OK 	OK OK OK OKl  $,0)-N NN N XtH~56	N
 N N #tCy.)N sDI~&N N N Nd (,#B B&,B B B B B Br:   r<   )Fr   rb   typingr   r   r   r   r   r   r	   r
   r   numpyr  r   r   r   r   r   r   datasets.packaged_modulesr   modelscope.hub.repositoryr   4modelscope.msdatasets.context.dataset_context_configr   5modelscope.msdatasets.data_loader.data_loader_managerr   r   r   r   !modelscope.msdatasets.dataset_clsr   r   9modelscope.msdatasets.dataset_cls.custom_datasets.builderr   (modelscope.msdatasets.utils.delete_utilsr   ,modelscope.msdatasets.utils.hf_datasets_utilr   (modelscope.msdatasets.utils.upload_utilsr   modelscope.preprocessorsr   modelscope.utils.configr   r    r   r!   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   modelscope.utils.file_utilsr,   modelscope.utils.import_utilsr-   r.   modelscope.utils.loggerr/   r   r9   r<   r[   r:   r8   <module>r     sI   
			 % % % % % % % % % % % % % % % % % % % % % %    + + + + + + + + + + + + + + @ @ @ @ @ @ 7 7 7 7 7 7                F F F F F F F F      I I I I I I N N N N N N I I I I I I 7 7 7 7 7 7 6 6 6 6 6 6 6 6 8 8 8 8 8 8D D D D D D D D D D D D D D D D D D D D D D D D
 9 8 8 8 8 8 M M M M M M M M . . . . . .	    _ _ _ _ _ _ _ _ _ _r:   