o
    z3i+                     @   s   d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ e	rZd dlZddlmZ ddde fddZ!eG dd dZ"ddde#fddZ$dS )    N)	dataclassfield)BytesIO)Path)TYPE_CHECKINGAnyClassVarDictOptionalUnion   )config)DownloadConfig)
array_cast)is_local_pathxopen)no_op_if_value_is_nullstring_to_dict   )FeatureTypepdfpdfplumber.pdf.PDFreturnc                 C   sL   t  }| jD ]	}||jj q| W  d   S 1 sw   Y  dS )z-Convert a pdfplumber.pdf.PDF object to bytes.N)r   pageswriter   streamgetvalue)r   bufferpage r   c/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/datasets/features/pdf.pypdf_to_bytes   s
   
$r!   c                   @   s  e Zd ZU dZdZeed< edddZe	e
 ed< dZee
 ed	< ee e d
Zee ed< ed dddZe
ed< dd Zdee
eeedf defddZddeddfddZdedee
df f fddZdeejejejf dejfddZ ddejdejfddZ!dS )Pdfa  
    **Experimental.**
    Pdf [`Feature`] to read pdf documents from a pdf file.

    Input: The Pdf feature accepts as input:
    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).
    - A `pathlib.Path`: path to the pdf file (i.e. random access is allowed).
    - A `dict` with the keys:
        - `path`: String with relative path of the pdf file in a dataset repository.
        - `bytes`: Bytes of the pdf file.
      This is useful for archived files with sequential access.

    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.

    Args:
        decode (`bool`, defaults to `True`):
            Whether to decode the pdf data. If `False`,
            returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.

    Examples:

    ```py
    >>> from datasets import Dataset, Pdf
    >>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
    >>> ds.features["pdf"]
    Pdf(decode=True, id=None)
    >>> ds[0]["pdf"]
    <pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>
    >>> ds = ds.cast_column("pdf", Pdf(decode=False))
    >>> ds[0]["pdf"]
    {'bytes': None,
    'path': 'path/to/pdf/file.pdf'}
    ```
    TdecodeNF)defaultrepridr   dtypebytespathpa_type)r$   initr%   _typec                 C   s   | j S N)r+   )selfr   r   r    __call__K   s   zPdf.__call__valuer   c                 C   s   t jrddl}nd}t|tr|ddS t|tr"t| ddS t|ttfr.d|dS |dur=t||j	j
r=t|S |ddurTtj|d rTd|ddS |ddusb|ddurm|d|ddS td| d)	zEncode example into a format for Arrow.

        Args:
            value (`str`, `bytes`, `pdfplumber.pdf.PDF` or `dict`):
                Data passed as input to Pdf feature.

        Returns:
            `dict` with "path" and "bytes" fields
        r   Nr*   r)   r*   r(   r)   zRA pdf sample should have one of 'path' or 'bytes' but they are missing or None in .)r   PDFPLUMBER_AVAILABLE
pdfplumber
isinstancestrr   absoluter)   	bytearrayr   PDFencode_pdfplumber_pdfgetosr*   isfile
ValueError)r/   r1   r5   r   r   r    encode_exampleN   s$   






zPdf.encode_examplec                 C   s.  | j stdtjrddl}ntd|du ri }|d |d }}|du r{|du r2td| dt|r=||}|S |	d	d
 }|
tjrMtjntj}zt||d }	||	}
W n tyi   d}
Y nw t|
d}t|d|d}||S |t|}|}W d   |S 1 sw   Y  |S )ai  Decode example pdf file into pdf data.

        Args:
            value (`str` or `dict`):
                A string with the absolute pdf file path, a dictionary with
                keys:

                - `path`: String with absolute or relative pdf file path.
                - `bytes`: The bytes of the pdf file.

            token_per_repo_id (`dict`, *optional*):
                To access and decode pdf files from private repositories on
                the Hub, you can pass a dictionary
                repo_id (`str`) -> token (`bool` or `str`).

        Returns:
            `pdfplumber.pdf.PDF`
        zKDecoding is disabled for this feature. Please use Pdf(decode=True) instead.r   Nz6To support decoding pdfs, please install 'pdfplumber'.r*   r)   z@A pdf should have one of 'path' or 'bytes' but both are None in r3   ::repo_idtokenrbdownload_config)r#   RuntimeErrorr   r4   r5   ImportErrorr?   r   opensplit
startswithHF_ENDPOINTHUB_DATASETS_URLHUB_DATASETS_HFFS_URLr   r<   r   r   r   )r/   r1   token_per_repo_idr5   r*   bytes_r   
source_urlpatternrC   rE   rH   fpr   r   r    decode_exampleq   sD   





zPdf.decode_exampler   c                 C   s(   ddl m} | jr| S |d|ddS )zfIf in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.r   )Valuebinarystringr(   )featuresrX   r#   )r/   rX   r   r   r    flatten   s   zPdf.flattenstoragec                 C   s<  t j|jr%t jdgt| t  d}t jj||gddg|	 d}nst j
|jrJt jdgt| t  d}t jj||gddg|	 d}nNt j|jr|jddkr_|d}nt jdgt| t  d}|jddkr{|d}nt jdgt| t  d}t jj||gddg|	 d}t|| jS )a  Cast an Arrow array to the Pdf arrow storage type.
        The Arrow types that can be converted to the Pdf pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the image bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter
        - `pa.list(*)` - it must contain the pdf array data

        Args:
            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Pdf arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Ntyper)   r*   maskr   )patypes	is_stringr_   arraylenrY   StructArrayfrom_arraysis_null	is_binaryrZ   	is_structget_field_indexr   r   r+   )r/   r]   bytes_array
path_arrayr   r   r    cast_storage   s     zPdf.cast_storagec                    s   du ri t fdd tj fdd| D t d}tjdd |d D t d}tjj||gd	dg|	 d
}t
|| jS )a4  Embed PDF files into the Arrow array.

        Args:
            storage (`pa.StructArray`):
                PyArrow array to embed.

        Returns:
            `pa.StructArray`: Array in the PDF arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Nc                    s   |  dd }|tjrtjntj}t||}|d ur# |d nd }t|d}t	| d|d}|
 W  d    S 1 s@w   Y  d S )NrA   rB   rC   rD   rF   rG   )rL   rM   r   rN   rO   rP   r   r<   r   r   read)r*   rS   rT   source_url_fieldsrE   rH   rU   )rQ   r   r    path_to_bytes   s   

$z(Pdf.embed_storage.<locals>.path_to_bytesc                    s8   g | ]}|d ur|d d u r |d n|d nd qS )Nr)   r*   r   ).0x)rr   r   r    
<listcomp>   s    *z%Pdf.embed_storage.<locals>.<listcomp>r^   c                 S   s$   g | ]}|d urt j|nd qS r.   )r=   r*   basename)rs   r*   r   r   r    ru      s   $ r*   r)   r`   )r   rb   re   	to_pylistrY   r   rZ   rg   rh   ri   r   r+   )r/   r]   rQ   rm   rn   r   )rr   rQ   r    embed_storage   s    
zPdf.embed_storager.   )"__name__
__module____qualname____doc__r#   bool__annotations__r   r&   r
   r7   r'   r   rb   structrY   rZ   r+   r   r-   r0   r   r)   r9   dictr@   rW   r	   r\   StringArrayrg   	ListArrayro   rx   r   r   r   r    r"      s   
 #$ #:$%r"   c                 C   s:   t | drt | jdr| jjr| jjddS dt| dS )aA  
    Encode a pdfplumber.pdf.PDF object into a dictionary.

    If the PDF has an associated file path, returns the path. Otherwise, serializes
    the PDF content into bytes.

    Args:
        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

    Returns:
        dict: A dictionary with "path" or "bytes" field.
    r   nameNr2   )hasattrr   r   r!   )r   r   r   r    r;     s   r;   )%r=   dataclassesr   r   ior   pathlibr   typingr   r   r   r	   r
   r   pyarrowrb    r   download.download_configr   tabler   utils.file_utilsr   r   utils.py_utilsr   r   r5   r[   r   r)   r!   r"   r   r;   r   r   r   r    <module>   s&      h