o
    z3iG                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZmZmZ ddlmZ ddlmZ dd	lmZ erXd dlZd dlZeeZG d
d dZG dd de
ZG dd dee
ZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&G d d! d!eZ'G d"d# d#Z(dS )$    N)ABCabstractmethod)Path)TYPE_CHECKINGOptionalUnion   )config   )FileLock)
get_loggerc                   @   s`   e Zd Zddee fddZdedefddZd	ed
edefddZdded
edefddZ	dS )ExtractManagerN	cache_dirc                 C   s&   |r
t j|tjntj| _t| _d S N)	ospathjoinr	   EXTRACTED_DATASETS_DIREXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr    r   d/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/datasets/utils/extract.py__init__   s   
zExtractManager.__init__r   returnc                 C   s,   ddl m} tj|}tj| j||S )Nr
   )hash_url_to_filename)
file_utilsr   r   r   abspathr   r   )r   r   r   abs_pathr   r   r   _get_output_path"   s   zExtractManager._get_output_pathoutput_pathforce_extractc                 C   s*   |pt j| ot j|ot | S r   )r   r   isfileisdirlistdir)r   r"   r#   r   r   r   _do_extract*   s   $zExtractManager._do_extractF
input_pathc                 C   s>   | j |}|s
|S | |}| ||r| j ||| |S r   )r   infer_extractor_formatr!   r'   extract)r   r(   r#   extractor_formatr"   r   r   r   r*   /   s   
zExtractManager.extractr   F)
__name__
__module____qualname__r   strr   r!   boolr'   r*   r   r   r   r   r      s
    r   c                   @   s\   e Zd Zeedeeef defddZ	e
edeeef deeef ddfdd	ZdS )
BaseExtractorr   r   c                 K      d S r   r   clsr   kwargsr   r   r   is_extractable:      zBaseExtractor.is_extractabler(   r"   Nc                 C   r3   r   r   )r(   r"   r   r   r   r*   >   r8   zBaseExtractor.extract)r-   r.   r/   classmethodr   r   r   r0   r1   r7   staticmethodr*   r   r   r   r   r2   9   s    .r2   c                   @   s`   e Zd ZU g Zee ed< edee	e
f defddZeddee	e
f dedefd	d
ZdS )MagicNumberBaseExtractormagic_numbersr   magic_number_lengthc                 C   s8   t | d}||W  d    S 1 sw   Y  d S )Nrb)openread)r   r=   fr   r   r   read_magic_numberF   s   $z*MagicNumberBaseExtractor.read_magic_number    magic_numberr   c                    sV    st dd | jD }z| || W n
 ty   Y dS w t fdd| jD S )Nc                 s   s    | ]}t |V  qd S r   )len.0cls_magic_numberr   r   r   	<genexpr>N   s    z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>Fc                 3   s    | ]}  |V  qd S r   )
startswithrF   rD   r   r   rI   S   s    )maxr<   rB   OSErrorany)r5   r   rD   r=   r   rK   r   r7   K   s   z'MagicNumberBaseExtractor.is_extractableNrC   )r-   r.   r/   r<   listbytes__annotations__r:   r   r   r0   intrB   r9   r1   r7   r   r   r   r   r;   C   s   
 &r;   c                   @   st   e Zd Zedeeef defddZe	de
jdeeef fddZe	d	eeef deeef dd
fddZd
S )TarExtractorr   r   c                 K   s
   t |S r   )tarfile
is_tarfiler4   r   r   r   r7   W   s   
zTarExtractor.is_extractablemembersr"   c                 #   s    dt ttf dtfdddtdtdtffdd dtjdtdtf fd	d
}|}| D ]D} |j|rCtd|j d q0|	 rZ|||rZtd|j d|j
  q0| rq|||rqtd|j d|j
  q0|V  q0dS )a  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309
        r   r   c                 S      t jt j| S r   r   r   realpathr   r   r   r   r   resolvedg      z*TarExtractor.safemembers.<locals>.resolvedbasec                        t j|| | S r   r   r   r   rJ   r   r^   r\   r   r   badpathj      z)TarExtractor.safemembers.<locals>.badpathinfoc                    s*   t j|t j| j} | j|dS N)r^   )r   r   r   dirnamenamelinknamere   r^   tiprc   r\   r   r   badlinkn   s   z)TarExtractor.safemembers.<locals>.badlinkExtraction of  is blocked (illegal path) is blocked: Symlink to z is blocked: Hard link to N)r   r   r0   r1   rU   TarInforh   loggererrorissymri   islnkrW   r"   rm   r^   finfor   rl   r   safemembers[   s   zTarExtractor.safemembersr(   Nc                 C   s:   t j|dd t| }|j|t||d |  d S )NTexist_okrW   )r   makedirsrU   r?   
extractallrT   rx   close)r(   r"   tar_filer   r   r   r*      s   
zTarExtractor.extract)r-   r.   r/   r9   r   r   r0   r1   r7   r:   rU   TarFilerx   r*   r   r   r   r   rT   V   s    #,rT   c                   @   <   e Zd ZdgZedeeef deeef ddfddZdS )GzipExtractors   r(   r"   r   Nc              	   C   x   t | d,}t|d}t|| W d    n1 sw   Y  W d    d S W d    d S 1 s5w   Y  d S Nr>   wb)gzipr?   shutilcopyfileobj)r(   r"   	gzip_fileextracted_filer   r   r   r*         "zGzipExtractor.extract	r-   r.   r/   r<   r:   r   r   r0   r*   r   r   r   r   r          ,r   c                       s   e Zd Zg dZeddeeef dede	f fddZ
edeej d	eeef fd
dZedeeef d	eeef ddfddZ  ZS )ZipExtractor)s   PKs   PKs   PKrC   r   rD   r   c                    s  t  j||dr
dS zddlm}m}m}m}m}m}m	}	m
}
m}m} t|d}|	|}|r|| dkrK|| dkrK|| dkrK	 W d    W dS || || kr|||  | || kr|| |
kr||
}t||
krt||}|| |kr	 W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS 1 sw   Y  W dS  ty   Y dS w )NrK   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirr>   F)superr7   zipfiler   r   r   r   r   r   r   r   r   r   r?   seektellr@   rE   structunpack	Exception)r5   r   rD   r   r   r   r   r   r   r   r   r   r   fpendrecdatacentdir	__class__r   r   r7      sT   0$






zZipExtractor.is_extractablerW   r"   c                 #   st    dt ttf dtfdd dtdtdtf fdd} |}| D ]}||j|r4td|j d	 q!|V  q!d
S )a,  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309

        This additional mitigation is applied for zipfile as well.
        r   r   c                 S   rX   r   rY   r[   r   r   r   r\      r]   z*ZipExtractor.safemembers.<locals>.resolvedr^   c                    r_   r   r`   ra   rb   r   r   rc      rd   z)ZipExtractor.safemembers.<locals>.badpathrn   ro   Nr   r   r0   r1   filenamerr   rs   )rW   r"   rc   r^   rw   r   rb   r   rx      s   zZipExtractor.safemembersr(   Nc                 C   s`   t j|dd t| d}|j|t|j|d |  W d    d S 1 s)w   Y  d S )NTry   rr{   )	r   r|   r   ZipFiler}   r   rx   filelistr~   )r(   r"   zip_filer   r   r   r*      s
   
"zZipExtractor.extractrO   )r-   r.   r/   r<   r9   r   r   r0   rQ   r1   r7   r:   rP   r   ZipInforx   r*   __classcell__r   r   r   r   r      s    &$"0r   c                   @   r   )XzExtractors   7zXZ r(   r"   r   Nc              	   C   sv   t | ,}t|d}t|| W d    n1 sw   Y  W d    d S W d    d S 1 s4w   Y  d S )Nr   )lzmar?   r   r   r(   r"   compressed_filer   r   r   r   r*      s   "zXzExtractor.extractr   r   r   r   r   r      r   r   c                   @   s`   e Zd ZddgZeded deeef fddZ	edeeef deeef d	d
fddZ
d
S )RarExtractors   Rar! s   Rar! rW   rarfile.RarInfor"   c                 #   s    dt ttf dtfdddtdtdtffdd dd	dtdtf fd
d}|}| D ]-} |j|rBtd|j d q/| rY|||rYtd|j d|j  q/|V  q/dS )a,  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309

        This additional mitigation is applied for rarfile as well.
        r   r   c                 S   rX   r   rY   r[   r   r   r   r\      r]   z*RarExtractor.safemembers.<locals>.resolvedr^   c                    r_   r   r`   ra   rb   r   r   rc     rd   z)RarExtractor.safemembers.<locals>.badpathre   r   c                    s4   t j|t j| j}| j\}}} ||dS rf   )r   r   r   rg   r   
file_redir)re   r^   rk   
redir_typeredir_flags	link_namerl   r   r   rm     s   z)RarExtractor.safemembers.<locals>.badlinkrn   ro   rp   N)	r   r   r0   r1   r   rr   rs   
is_symlinkr   rv   r   rl   r   rx      s   zRarExtractor.safemembersr(   r   Nc                 C   sT   t jstddd l}tj|dd || }|j|t	|
 |d |  d S )NzPlease pip install rarfiler   Try   r{   )r	   RARFILE_AVAILABLEImportErrorrarfiler   r|   RarFiler}   r   rx   infolistr~   )r(   r"   r   rfr   r   r   r*     s   
zRarExtractor.extractr-   r.   r/   r<   r:   rP   r   r   r0   rx   r*   r   r   r   r   r      s     $,r   c                   @   r   )ZstdExtractors   (/r(   r"   r   Nc              	   C   s   t jstddd l}| }t| d,}t|d}||| W d    n1 s+w   Y  W d    d S W d    d S 1 sCw   Y  d S )NzPlease pip install zstandardr   r>   r   )r	   ZSTANDARD_AVAILABLEr   	zstandardZstdDecompressorr?   copy_stream)r(   r"   zstddctxifhofhr   r   r   r*   &  s   PzZstdExtractor.extractr   r   r   r   r   r   #  r   r   c                   @   r   )Bzip2Extractors   BZhr(   r"   r   Nc              	   C   r   r   )bz2r?   r   r   r   r   r   r   r*   4  r   zBzip2Extractor.extractr   r   r   r   r   r   1  r   r   c                   @   s^   e Zd ZdgZeded deeef fddZ	edeeef deeef dd	fd
dZ
d	S )SevenZipExtractors   7z'rW   py7zr.FileInfor"   c                 #   s    dt ttf dtfdddtdtdtffdd dd	dtdtf fd
d}|}| D ]} |j|rBtd|j d q/|V  q/dS )a*  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309

        This additional mitigation is applied for py7zr as well.
        r   r   c                 S   rX   r   rY   r[   r   r   r   r\   L  r]   z/SevenZipExtractor.safemembers.<locals>.resolvedr^   c                    r_   r   r`   ra   rb   r   r   rc   O  rd   z.SevenZipExtractor.safemembers.<locals>.badpathre   r   c                    s2   t j|t j| j} t j| j|dS rf   )r   r   r   rg   r   basenamerj   rl   r   r   rm   S  s   z.SevenZipExtractor.safemembers.<locals>.badlinkrn   ro   Nr   rv   r   rl   r   rx   >  s   zSevenZipExtractor.safemembersr(   r   Nc                 C   s~   t jstddd l}tj|dd || d}dd t|	 |D }|j
||d W d    d S 1 s8w   Y  d S )	NzPlease pip install py7zrr   Try   r   c                 S   s   g | ]}|j qS r   )r   )rG   rw   r   r   r   
<listcomp>k  s    z-SevenZipExtractor.extract.<locals>.<listcomp>)targets)r	   PY7ZR_AVAILABLEr   py7zrr   r|   SevenZipFiler   rx   rP   r*   )r(   r"   r   archiver   r   r   r   r*   c  s   "zSevenZipExtractor.extractr   r   r   r   r   r   ;  s     $,r   c                   @   r   )Lz4Extractors   "Mr(   r"   r   Nc              	   C   s   t jstddd l}|j| d,}t|d}t|| W d    n1 s)w   Y  W d    d S W d    d S 1 sAw   Y  d S )NzPlease pip install lz4r   r>   r   )r	   LZ4_AVAILABLEr   	lz4.frameframer?   r   r   )r(   r"   lz4r   r   r   r   r   r*   r  s   "zLz4Extractor.extractr   r   r   r   r   r   o  r   r   c                
   @   s   e Zd ZU eeeeeee	e
ed	Zeeee f ed< edd Zedeeef defddZeddeeef d
edefddZedeeef dee fddZedeeef deeef deddfddZdS )r   )	tarr   zipxzrarr   r   7zr   
extractorsc                 C   s   t dd | j D S )Nc                 s   s.    | ]}t |tr|jD ]}t|V  qqd S r   )
issubclassr;   r<   rE   )rG   r   extractor_magic_numberr   r   r   rI     s    z9Extractor._get_magic_number_max_length.<locals>.<genexpr>)rL   r   values)r5   r   r   r   _get_magic_number_max_length  s   z&Extractor._get_magic_number_max_lengthr   r=   c                 C   s&   zt j| |dW S  ty   Y dS w )N)r=   rC   )r;   rB   rM   )r   r=   r   r   r   _read_magic_number  s
   zExtractor._read_magic_numberFreturn_extractorr   c                 C   s>   t jdtd | |}|r|sdS d| j| fS |sdS dS )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.)categoryTF)FN)warningswarnFutureWarningr)   r   )r5   r   r   r+   r   r   r   r7     s   
zExtractor.is_extractablec                 C   sB   |   }| ||}| j D ]\}}|j||dr|  S qd S )NrK   )r   r   r   itemsr7   )r5   r   magic_number_max_lengthrD   r+   r   r   r   r   r)     s   z Extractor.infer_extractor_formatr(   r"   r+   Nc                 C   sx   t jt j|dd tt|d}t| tj	|dd | j
| }|||W  d    S 1 s5w   Y  d S )NTry   z.lock)ignore_errors)r   r|   r   rg   r0   r   with_suffixr   r   rmtreer   r*   )r5   r(   r"   r+   	lock_pathr   r   r   r   r*     s   


$zExtractor.extractr,   )r-   r.   r/   rT   r   r   r   r   r   r   r   r   r   dictr0   typer2   rR   r9   r   r:   r   r   rS   r   r1   r7   r   r)   r*   r   r   r   r   r   }  s:   
 
" 

r   ))r   r   r   r   r   r   rU   r   r   abcr   r   pathlibr   typingr   r   r    r	   	_filelockr   loggingr   r   r   r-   rr   r   r2   r;   rT   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s@    
1
T
4
4