o
    z3i/                     @   s   d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 ddl
mZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddl m!Z! G dd de!Z"G dd dZ#dS )    N)BinaryIOOptionalUnion   )DatasetFeatures
NamedSplitconfig)$get_writer_batch_size_from_data_size#get_writer_batch_size_from_featuresrequire_storage_embed)query_table)_PACKAGED_DATASETS_MODULES)Parquet)tqdm)NestedDataStructureLikePathLike   )AbstractDatasetReaderc                       s`   e Zd Z						ddee dee dee dede	de	d	ee
 f fd
dZdd Z  ZS )ParquetDatasetReaderNFpath_or_pathssplitfeatures	cache_dirkeep_in_memory	streamingnum_procc           
   	      sd   t  j|f||||||d| t|tr|n| j|i}td d }	td||||	d|| _d S )N)r   r   r   r   r   r   parquetr   )r   
data_filesr   hash )super__init__
isinstancedictr   r   r   builder)
selfr   r   r   r   r   r   r   kwargsr    	__class__r!   a/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/datasets/io/parquet.pyr#      s,   
zParquetDatasetReader.__init__c                 C   s\   | j r| jj| jd}|S d }d }d }d }| jj||||| jd | jj| j|| jd}|S )N)r   )download_configdownload_modeverification_mode	base_pathr   )r   r.   	in_memory)r   r&   as_streaming_datasetr   download_and_preparer   
as_datasetr   )r'   datasetr,   r-   r.   r/   r!   r!   r+   read3   s$   
zParquetDatasetReader.read)NNNFFN)__name__
__module____qualname__r   r   r   r   r   strboolintr#   r5   __classcell__r!   r!   r)   r+   r      s.    r   c                   @   st   e Zd Z				ddedeeef dee dee	 dee
e	f de
fd	d
ZdefddZdededefddZdS )ParquetDatasetWriterNTr4   path_or_buf
batch_sizestorage_optionsuse_content_defined_chunkingwrite_page_indexc                 K   s\   || _ || _|pt|jptt|| | _|pi | _|| _	|du r&t
j}|| _|| _d S )NT)r4   r>   r   r   r
   len_estimate_nbytesr?   r@   parquet_writer_kwargsr	   DEFAULT_CDC_OPTIONSrA   rB   )r'   r4   r>   r?   r@   rA   rB   rE   r!   r!   r+   r#   L   s   


zParquetDatasetWriter.__init__returnc                 C   s   t | jtttjfr8tj| jdfi | jpi }| j	d|| j
d| j}W d    |S 1 s1w   Y  |S | j	d| j| j
d| j}|S )Nwb)file_objr?   r!   )r$   r>   r9   bytesosr   fsspecopenr@   _writer?   rE   )r'   bufferwrittenr!   r!   r+   writed   s&   
zParquetDatasetWriter.writerI   c           
   
   K   s   d}| dd}| jjj}tj|f|| j| jdd | jj D dd | jj D dd | jj D d	|}t	t
dt| j|d
ddD ]}t| jjt||| | jjd}	||	 ||	j7 }qH| jduru|dt| ji |  |S )zWrites the pyarrow table as Parquet to a binary file handle.

        Caller is responsible for opening and closing the handle.
        r   r>   Nc                 S   s"   i | ]\}}|t |rd ndqS )nonesnappyr   .0colfeaturer!   r!   r+   
<dictcomp>   s    z/ParquetDatasetWriter._write.<locals>.<dictcomp>c                 S   s   g | ]
\}}t |s|qS r!   r   rT   r!   r!   r+   
<listcomp>   s
    z/ParquetDatasetWriter._write.<locals>.<listcomp>c                 S   s   i | ]\}}t |r|d qS )PLAINr   rT   r!   r!   r+   rX      s
    )schemarA   rB   compressionuse_dictionarycolumn_encodingbaz"Creating parquet from Arrow format)unitdesc)tablekeyindicesFcontent_defined_chunking)popr4   r   arrow_schemapqParquetWriterrA   rB   itemshf_tqdmrangerC   r   _dataslice_indiceswrite_tablenbytesadd_key_value_metadatajsondumpsclose)
r'   rI   r?   rE   rP   _r[   writeroffsetbatchr!   r!   r+   rN   t   sJ   






zParquetDatasetWriter._write)NNTT)r6   r7   r8   r   r   r   r   r   r;   r%   r:   r#   rQ   rN   r!   r!   r!   r+   r=   K   s(    


r=   )$rs   rK   typingr   r   r   rL   pyarrow.parquetr   rh    r   r   r   r	   arrow_writerr
   r   features.featuresr   
formattingr   packaged_modulesr    packaged_modules.parquet.parquetr   utilsr   rk   utils.typingr   r   abcr   r   r=   r!   r!   r!   r+   <module>   s     8