
    #j                         d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 ddl
mZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddl m!Z!  G d de!          Z" G d d          Z#dS )    N)BinaryIOOptionalUnion   )DatasetFeatures
NamedSplitconfig)$get_writer_batch_size_from_data_size#get_writer_batch_size_from_featuresrequire_storage_embed)query_table)_PACKAGED_DATASETS_MODULES)Parquet)tqdm)NestedDataStructureLikePathLike   )AbstractDatasetReaderc                        e Zd Z	 	 	 	 	 	 ddee         dee         dee         dede	de	d	ee
         f fd
Zd Z xZS )ParquetDatasetReaderNFpath_or_pathssplitfeatures	cache_dirkeep_in_memory	streamingnum_procc           
           t                      j        |f||||||d| t          |t                    r|n| j        |i}t
          d         d         }	t          d||||	d|| _        d S )N)r   r   r   r   r   r   parquetr   )r   
data_filesr   hash )super__init__
isinstancedictr   r   r   builder)selfr   r   r   r   r   r   r   kwargsr#   	__class__s             ]/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/datasets/io/parquet.pyr&   zParquetDatasetReader.__init__   s     			
)		
 		
 		
 		
 		
 *4M4)H)Hitz[hNi))4Q7 
$	
 

 
 
    c                     | j         r!| j                            | j                  }nRd }d }d }d }| j                            ||||| j                   | j                            | j        | j                  }|S )N)r   )download_configdownload_modeverification_mode	base_pathr   )r   	in_memory)r   r)   as_streaming_datasetr   download_and_preparer   
as_datasetr   )r*   datasetr0   r1   r2   r3   s         r-   readzParquetDatasetReader.read3   s    > 	_l77dj7IIGG #O M $IL-- /+"3# .    l--DJ$J]-^^Gr.   )NNNFFN)__name__
__module____qualname__r   r   r   r	   r   strboolintr&   r9   __classcell__)r,   s   @r-   r   r      s         '+'+$"&
 
.x8
 
#
 8$	

 
 
 
 3-
 
 
 
 
 
>      r.   r   c                       e Zd Z	 	 	 	 ddedeeef         dee         dee	         dee
e	f         de
fd	Zd
efdZdeded
efdZdS )ParquetDatasetWriterNTr8   path_or_buf
batch_sizestorage_optionsuse_content_defined_chunkingwrite_page_indexc                    || _         || _        |pBt          |j                  p.t	          t          |          |                                          | _        |pi | _        || _	        |du rt          j        }|| _        || _        d S )NT)r8   rC   r   r   r   len_estimate_nbytesrD   rE   parquet_writer_kwargsr
   DEFAULT_CDC_OPTIONSrF   rG   )r*   r8   rC   rD   rE   rF   rG   rK   s           r-   r&   zParquetDatasetWriter.__init__J   s     & ^273CDD^3CLL'BZBZB\B\]] 	
  /4"%:"'4//+1+E(,H) 0r.   returnc                 @   t          | j        t          t          t          j        f          rRt          j        | j        dfi | j        pi 5 } | j	        d|| j
        d| j        }d d d            n# 1 swxY w Y   n | j	        d| j        | j
        d| j        }|S )Nwb)file_objrD   r$   )r'   rC   r=   bytesosr   fsspecopenrE   _writerD   rK   )r*   bufferwrittens      r-   writezParquetDatasetWriter.writeb   s
   d&eR[(ABB 	T-tTT8L8RPRTT X^%$+ ##  0                "dk )?  , G
 s   A33A7:A7rP   c                 0   d}|                     dd          }| j        j        j        }t	          j        |f|| j        | j        d | j        j                                        D             d | j        j                                        D             d | j        j                                        D             d|}t          t          dt          | j                  |          dd	
          D ]X}t          | j        j        t          |||z             | j        j                  }	|                    |	           ||	j        z  }Y| j        dur.|                    dt'          j        | j                  i           |                                 |S )zWrites the pyarrow table as Parquet to a binary file handle.

        Caller is responsible for opening and closing the handle.
        r   rC   Nc                 <    i | ]\  }}|t          |          rd ndS )nonesnappyr   .0colfeatures      r-   
<dictcomp>z/ParquetDatasetWriter._write.<locals>.<dictcomp>   s@        C 4W==KVV8  r.   c                 6    g | ]\  }}t          |          |S r$   r   r]   s      r-   
<listcomp>z/ParquetDatasetWriter._write.<locals>.<listcomp>   s;       $WMbcjMkMk  r.   c                 8    i | ]\  }}t          |          |d S )PLAINr   r]   s      r-   ra   z/ParquetDatasetWriter._write.<locals>.<dictcomp>   s=       !-gRghoRpRpW  r.   )schemarF   rG   compressionuse_dictionarycolumn_encodingbaz"Creating parquet from Arrow format)unitdesc)tablekeyindicesFcontent_defined_chunking)popr8   r   arrow_schemapqParquetWriterrF   rG   itemshf_tqdmrangerI   r   _dataslice_indiceswrite_tablenbytesadd_key_value_metadatajsondumpsclose)
r*   rP   rD   rK   rW   _rf   writeroffsetbatchs
             r-   rU   zParquetDatasetWriter._writer   s   
 !%%mT::&3!
)-)J!2 $(L$9$?$?$A$A   (,(=(C(C(E(E   151F1L1L1N1N  
 
 $
 
$ !S&&
335
 
 
 	$ 	$F
  l(&&:"566-  E
 u%%%u|#GG ,E99))+EtzRVRsGtGt*uvvvr.   )NNTT)r:   r;   r<   r   r   r   r   r   r?   r(   r>   r&   rX   rU   r$   r.   r-   rB   rB   I   s        
 %)*.:>!%1 11 8X-.1 SM	1
 "$1 ',D$J&71 1 1 1 10s     -x -S -VY - - - - - -r.   rB   )$r~   rR   typingr   r   r   rS   pyarrow.parquetr!   rs    r   r   r	   r
   arrow_writerr   r   features.featuresr   
formattingr   packaged_modulesr    packaged_modules.parquet.parquetr   utilsr   rv   utils.typingr   r   abcr   r   rB   r$   r.   r-   <module>r      sx    				 , , , , , , , , , ,        4 4 4 4 4 4 4 4 4 4 4 4 d d d d d d d d 5 5 5 5 5 5 $ $ $ $ $ $ 9 9 9 9 9 9 6 6 6 6 6 6 # # # # # # < < < < < < < < & & & & & &3 3 3 3 30 3 3 3lV V V V V V V V V Vr.   