o
    z3i,                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ  d
dl!m"Z"m#Z#m$Z$m%Z%m&Z& d
dl'm(Z( ee)e*e*f e)e* e)d f Z+e*ej,Z-e.e/Z0G dd de*Z1G dd de2Z3dZ4ej,ddgej5g dej6g diZ7dZ8ej9e:dk rdd gZ;g d!Z<nej9e:d"k rd#d gZ;g d$Z<nd%d&gZ;g d'Z<ej,ej5ej6gZ=d(d) e=D Z>d*d) e=D Z?ej,d+giZ@d,d-giZAe4gZBeAe?e>e@gZCd.ZDg d/ZEd0e*d1eFfd2d3ZGd4eeHeIe*f d1eHe*eeIe* d5f f fd6d7ZJd8e*d0e*d1eFfd9d:ZKd8e*d0e*d1eFfd;d<ZLd=e
e*geIe* f d1eHe*eIe* f fd>d?ZM		dUd0e*d@e*dAeeIe*  dBee d1eIe* f
dCdDZNdVd@e*dBee d1eHe*eIe* f fdEdFZO	dVdGe*dBee d1e+fdHdIZP		dUdJeIe* dBee dKeeQ d1eIe+ fdLdMZRG dNd5 d5eIe* ZSG dOdP dPeHe*eSf ZTG dQdR dReIe* ZUG dSdT dTeHe*eUf ZVdS )W    N)partial)	has_magic)PathPurePath)CallableOptionalUnion)	url_to_fs)HfFileSystem)version)
thread_map   )config)DownloadConfig)	_split_re)Split)logging)tqdm)!_prepare_path_and_storage_optionsis_local_pathis_relative_path	xbasenamexjoin)string_to_dict c                   @      e Zd ZdS )UrlN__name__
__module____qualname__r   r   r   a/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/datasets/data_files.pyr   !       r   c                   @   r   )EmptyDatasetErrorNr   r   r   r   r!   r#   %   r"   r#   zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*traintraining)
validationvaliddevval)testtestingeval
evaluationz-._ 0-9z2023.9.0z**[{sep}/]{keyword}[{sep}]*z{keyword}[{sep}]*)z{keyword}/**z{keyword}[{sep}]*/**z**[{sep}/]{keyword}/**z**[{sep}/]{keyword}[{sep}]*/**z	2023.12.0z**/*[{sep}/]{keyword}[{sep}]*)z{keyword}/**/*z{keyword}[{sep}]*/**/*z**/*[{sep}/]{keyword}/**/*z"**/*[{sep}/]{keyword}[{sep}]*/**/*z**/{keyword}[{sep}]*z**/*[{sep}]{keyword}[{sep}]*)z**/{keyword}/**z**/{keyword}[{sep}]*/**z**/*[{sep}]{keyword}/**z**/*[{sep}]{keyword}[{sep}]*/**c                 C       i | ]}|d d t | D qS )c                 S   $   g | ]}t D ]	}|j|td qqS )keywordsep)"KEYWORDS_IN_FILENAME_BASE_PATTERNSformatNON_WORDS_CHARS.0r1   patternr   r   r!   
<listcomp>L       <dictcomp>.<listcomp>SPLIT_KEYWORDSr7   splitr   r   r!   
<dictcomp>K       r@   c                 C   r.   )c                 S   r/   r0   )"KEYWORDS_IN_DIR_NAME_BASE_PATTERNSr4   r5   r6   r   r   r!   r9   T   r:   r;   r<   r>   r   r   r!   r@   S   rA   z**logsz	**/*.evalz*[])z	README.mdzconfig.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonr8   returnc                    s   t  fddtD S )Nc                 3   s    | ]}| v V  qd S Nr   )r7   wildcard_characterr8   r   r!   	<genexpr>v       z%contains_wildcards.<locals>.<genexpr>)anyWILDCARD_CHARACTERSrG   r   rG   r!   contains_wildcardsu   s   rL   patternsDataFilesListc                 C   s   t | trdd |  D S t | trt| giS t | trntdd | D rj| D ]"}t |trCt|dkrCd|v rCt |dttfsJt	d| q(d	d
 | D }tt
|t|krct	d| dd | D S t| iS tt| S )a/  
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    c                 S   s*   i | ]\}}t |t|tr|n|gqS r   str
isinstancelist)r7   keyvaluer   r   r!   r@      s   * z%sanitize_patterns.<locals>.<dictcomp>c                 s   s    | ]}t |tV  qd S rE   )rQ   dictr7   r8   r   r   r!   rH          z$sanitize_patterns.<locals>.<genexpr>   r?   pathzInvalid format for data_files entry. Each item must be a dictionary with the structure {'split': <split_name>, 'path': <path_or_list_of_paths>}.
Received: c                 S   s   g | ]}|d  qS r?   r   rV   r   r   r!   r9          z%sanitize_patterns.<locals>.<listcomp>z*Some splits are duplicated in data_files: c                 S   s6   i | ]}t |d  t|d tr|d n|d gqS )r?   rY   rO   rV   r   r   r!   r@      s    ()rQ   rU   itemsrP   SANITIZED_DEFAULT_SPLITrR   rJ   lenget
ValueErrorsetsanitize_patterns)rM   r8   splitsr   r   r!   rb   y   s6   
	


	rb   matched_rel_pathc                 C   s<   dd t | jjD }dd t |jjD }t|t|kS )u  
    When a path matches a pattern, we additionally check if it's inside a special directory
    we ignore by default (if it starts with a double underscore).

    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
    mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── __pycache__
            └── b.txt

    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
    False
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
    False
    c                 S      g | ]	}| d r|qS __
startswithr7   partr   r   r!   r9          z6_is_inside_unrequested_special_dir.<locals>.<listcomp>c                 S   re   rf   rh   rj   r   r   r!   r9      rl   )r   parentpartsr^   )rd   r8   data_dirs_to_ignore_in_pathdata_dirs_to_ignore_in_patternr   r   r!   "_is_inside_unrequested_special_dir   s   rq   c                 C   s8   dd t | jD }dd t |jD }t|t|kS )u9  
    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    c                 S   (   g | ]}| d rt|d hks|qS .ri   ra   rj   r   r   r!   r9          
zS_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir.<locals>.<listcomp>c                 S   rr   rs   ru   rj   r   r   r!   r9      rv   )r   rn   r^   )rd   r8   hidden_directories_in_pathhidden_directories_in_patternr   r   r!   ?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir   s   5ry   pattern_resolverc           	         sv  t D ]odd}z| |}W n	 ty   Y qw t|dkrqt |D ]}tt|t}|dus6J |d  q%tdd D rRt	dt
 d	 d
fddtD tdd tD   }fdd|D   S qtD ]< g }  D ]&\}}|D ]}z| |}W n	 ty   Y qw t|dkr||  nqq||r fdd|D   S qttd| d|  )a+  
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    z{split}*r   Nr?   c                 s   s    | ]
}t t| V  qd S rE   )rematchr   r>   r   r   r!   rH     s    z+_get_data_files_patterns.<locals>.<genexpr>zSplit name should match 'z'' but got 'z'.c                    s   g | ]
}| v rt |qS r   rP   r>   )rc   r   r!   r9         z,_get_data_files_patterns.<locals>.<listcomp>c                 S   s   h | ]}t |qS r   r~   r>   r   r   r!   	<setcomp>  r[   z+_get_data_files_patterns.<locals>.<setcomp>c                    s   i | ]
}| j |d gqS )rZ   )r4   r>   )split_patternr   r!   r@     r   z,_get_data_files_patterns.<locals>.<dictcomp>c                    s   i | ]}| | qS r   r   r>   )patterns_dictr   r!   r@   )  s    zCouldn't resolve pattern z with resolver )ALL_SPLIT_PATTERNSreplaceFileNotFoundErrorr^   ra   r   r   addrJ   r`   r   DEFAULT_SPLITSsortedALL_DEFAULT_PATTERNSr\   append)	rz   r8   
data_filespp_partssorted_splitsnon_empty_splitsr?   rM   r   )r   r   rc   r!   _get_data_files_patterns  sL   
r   	base_pathallowed_extensionsdownload_configc                    s  t | r
t|| } nt| rtj| d tj }nd}t| |d\} }t| fi |\}}t	t
t| h }d| v rB| dd nt|jtrK|jn|jd }|dkrX|d nd}	i }
|dkrdd|
d< | d	^}}g }|j|fd
di|
 D ]E\}}|d dks|drtjtj|rt||v rqzt||rqzt||rqzd|v r|n|	| }|rd	|g| }|| qz dur fdd|D }t|t|k rtt	|t	| }td|  d|  n|}|sd|  d} dur|dt  7 }t||S )a  
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    r    r   z://filehfFexpand_infoz::detailTtypeislinkNc                    s8   g | ]}t  fd dt|ddd D r|qS )c                 3   s    | ]	}d |  v V  qdS )rt   Nr   )r7   suffixr   r   r!   rH     s    z-resolve_pattern.<locals>.<listcomp>.<genexpr>rt   r   N)rJ   r   r?   )r7   filepathr   r   r!   r9     s    &z#resolve_pattern.<locals>.<listcomp>z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )r   r   r   osrY   
splitdriver2   r   r	   ra   FILES_TO_IGNOREr   r?   rQ   protocolrP   globr\   r_   isfilerealpathrq   ry   joinr   r^   rR   loggerinfor   )r8   r   r   r   storage_optionsfs
fs_patternfiles_to_ignorer   protocol_prefixglob_kwargs_	rest_hopsmatched_pathsr   r   outinvalid_matched_files	error_msgr   r   r!   resolve_pattern-  s^   0"*



r   c                 C   s:   t t| |d}zt|W S  ty   td|  ddw )uA
  
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    )r   r   zThe directory at z doesn't contain any data filesN)r   r   r   r   r#   )r   r   resolverr   r   r!   get_data_patterns  s   T
r   	data_filec                 C   s   |  tjr,ttj|jd}d| ttjd d   } | d|  dr%dndd} | }nt| |d\} }t| fi |\}}t	|trS|
|}t|d	rS|j|jfS ||}d
D ]}||v rit|| f  S qZdS )N)endpointtokenhf://r   z	/resolve/zhf://buckets//@r   revision)ETagetagmtimer   )ri   r   HF_ENDPOINTr
   r   r^   r   r   r	   rQ   resolve_pathhasattrrepo_idr   r   rP   )r   r   r   fs_pathr   resolved_pathr   rS   r   r   r!   _get_single_origin_metadata  s"   



r   r   max_workersc                    st   |d ur|nt j}tdd | D r& fddt| dt| dkp!d dD S ttt d| |tdt| dkp7d d	S )
Nc                 s   s    | ]}d |v V  qdS )r   Nr   r7   r   r   r   r!   rH     rI   z'_get_origin_metadata.<locals>.<genexpr>c                    s   g | ]}t | d qS )r   )r   r   r   r   r!   r9     s    
z(_get_origin_metadata.<locals>.<listcomp>zResolving data files   )descdisabler   )r   
tqdm_classr   r   )r   &HF_DATASETS_MULTITHREADING_MAX_WORKERSallhf_tqdmr^   r   r   r   )r   r   r   r   r   r!   _get_origin_metadata
  s"   
	
r   c                       s0  e Zd ZdZdee dee ddf fddZdd	d
Ze				ddee de
jjdee deee  dee dd fddZe				ddee dee deee  dee dd f
ddZe				ddee dee deee  dee dd f
ddZddddeee  deee  dd fddZ  ZS )rN   a  
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    r   origin_metadatarD   Nc                       t  | || _d S rE   )super__init__r   )selfr   r   	__class__r   r!   r   9  s   
zDataFilesList.__init__otherc                 C      t g | || j|j S rE   )rN   r   r   r   r   r   r!   __add__=     zDataFilesList.__add__rM   dataset_infor   r   r   c                 C   s6   d|j  d|j d|pd d}| j||||dS )Nzhf://datasets/r   r   r   r   r   r   )idsharstripfrom_patterns)clsrM   r   r   r   r   r   r   r!   from_hf_repo@  s   $	zDataFilesList.from_hf_repoc                 C   s,   |d ur|nt    }| j||||dS Nr   )r   resolveas_posixr   )r   rM   r   r   r   r   r   r!   from_local_or_remoteN  s   z"DataFilesList.from_local_or_remotec              	   C   st   |d ur|nt    }g }|D ]}z|t||||d W q ty.   t|s, Y qw t||d}| ||S Nr   r   )r   r   r   extendr   r   r   r   )r   rM   r   r   r   r   r8   r   r   r   r!   r   [  s&   
zDataFilesList.from_patterns
extensions
file_namesr   r   c                   s   g  |rd dd |D } td| d |r2d dd |D } td| d  rBt fd	d
| D | jdS tt| | jdS )N|c                 s       | ]}t |V  qd S rE   r|   escape)r7   extr   r   r!   rH   z  rW   z'DataFilesList.filter.<locals>.<genexpr>z.*(z	)(\..+)?$c                 s   r   rE   r   )r7   fnr   r   r!   rH   }  rW   z.*[\/]?(z)$c                    s&   g | ] t  fd dD r qS )c                 3   s    | ]}|  V  qd S rE   )r}   rV   r   r   r!   rH     rW   z2DataFilesList.filter.<locals>.<listcomp>.<genexpr>)rJ   )r7   rM   r   r!   r9     s   & z(DataFilesList.filter.<locals>.<listcomp>)r   )r   r   r|   compilerN   r   rR   )r   r   r   ext_pattern
fn_patternr   r   r!   filteru  s   zDataFilesList.filter)r   rN   rD   rN   NNN)r   r   r    __doc__rR   rP   SingleOriginMetadatar   r   classmethodhuggingface_hubhf_apiDatasetInfor   r   r   r   r   r   __classcell__r   r   r   r!   rN   '  sv    "





c                   @   s0  e Zd ZdZe			ddeeeee e	f f de
e de
ee  de
e dd f
dd	Ze			ddeeeee e	f f d
ejjde
e de
ee  de
e dd fddZe			ddeeeee e	f f de
e de
ee  de
e dd f
ddZdddde
ee  de
ee  dd fddZdS )DataFilesDicta  
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see [`DataFilesList`].

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    NrM   r   r   r   rD   c                 C   @   |  }|  D ]\}}t|tr|ntj||||d||< q|S r   )r\   rQ   rN   r   r   rM   r   r   r   r   rS   patterns_for_keyr   r   r!   r        
z"DataFilesDict.from_local_or_remoter   c           	      C   sB   |  }|  D ]\}}t|tr|n	tj|||||d||< q|S )N)r   r   r   r   )r\   rQ   rN   r   )	r   rM   r   r   r   r   r   rS   r  r   r   r!   r     s   	zDataFilesDict.from_hf_repoc                 C   r  r   )r\   rQ   rN   r   r  r   r   r!   r     r	  zDataFilesDict.from_patternsr   r   r   c                C   s2   t |  }|  D ]\}}|j||d||< q	|S )Nr   )r   r\   r   )r   r   r   r   rS   data_files_listr   r   r!   r     s   
zDataFilesDict.filterr   )r   r   r    r   r   rU   rP   r   rR   rN   r   r   r   r  r  r  r   r   r   r   r   r   r!   r    sr    




r  c                       s   e Zd ZdZdee deeee   f fddZdd Ze		ddee deee  d	d fd
dZ
	ddedee d	dfddZdee d	d fddZ  ZS )DataFilesPatternsListz
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or a None ot keep all the files for the pattern.
    rM   r   c                    r   rE   )r   r   r   )r   rM   r   r   r   r!   r     s   
zDataFilesPatternsList.__init__c                 C   r   rE   )rN   r   r   r   r   r!   r     r   zDataFilesPatternsList.__add__NrD   c                 C   s   | ||gt | S rE   )r^   )r   rM   r   r   r   r!   r     s   z#DataFilesPatternsList.from_patternsr   r   rN   c              	   C   s   |d ur|nt    }g }t| | jD ]\}}z|t||||d W q ty4   t|s2 Y qw t	||d}t
||S r   )r   r   r   zipr   r   r   r   r   r   rN   )r   r   r   r   r8   r   r   r   r   r!   r     s&   
zDataFilesPatternsList.resolver   c                    s   t |  fdd| jD S )Nc                    s   g | ]}|  qS r   r   )r7   r   r   r   r!   r9     r[   z;DataFilesPatternsList.filter_extensions.<locals>.<listcomp>)r  r   )r   r   r   r  r!   filter_extensions  s   z'DataFilesPatternsList.filter_extensionsrE   )r   r   r    r   rR   rP   r   r   r   r   r   r   r   r  r  r   r   r   r!   r    s4    

r  c                   @   sv   e Zd ZdZe	ddeeee f deee  dd fddZ		dded	ee
 dd
fddZdee dd fddZdS )DataFilesPatternsDictz[
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    NrM   r   rD   c                 C   s<   |  }|  D ]\}}t|tr|ntj||d||< q|S )Nr   )r\   rQ   r  r   )r   rM   r   r   rS   r  r   r   r!   r     s   z#DataFilesPatternsDict.from_patternsr   r   r  c                 C   s,   t  }|  D ]\}}|||||< q|S rE   )r  r\   r   )r   r   r   r   rS   data_files_patterns_listr   r   r!   r   /  s   zDataFilesPatternsDict.resolver   c                 C   s.   t |  }|  D ]\}}||||< q	|S rE   )r   r\   r  )r   r   r   rS   r  r   r   r!   r  9  s   
z'DataFilesPatternsDict.filter_extensionsrE   )r   r   r    r   r   rU   rP   rR   r   r   r   r   r  r   r   r   r!   r    s(    


r  )NNrE   )Wr   r|   	functoolsr   r   r   pathlibr   r   typingr   r   r   r  fsspec.corer	   r
   	packagingr   tqdm.contrib.concurrentr   r   r   downloadr   namingr   rc   r   utilsr   r   r   utils.file_utilsr   r   r   r   r   utils.py_utilsr   tuplerP   r   TRAINr]   
get_loggerr   r   r   r   r#   SPLIT_PATTERN_SHARDED
VALIDATIONTESTr=   r5   FSSPEC_VERSIONparser3   rB   r   "DEFAULT_PATTERNS_SPLIT_IN_FILENAME"DEFAULT_PATTERNS_SPLIT_IN_DIR_NAMEDEFAULT_PATTERNS_ALLDEFAULT_PATTERNS_LOGSr   r   rK   r   boolrL   rU   rR   rb   rq   ry   r   r   r   r   intr   rN   r  r  r  r   r   r   r!   <module>   s    








0)!,>/

(j]

a]5