
    #j/                        d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ e	r
d dlZddlm Z  ddde!fdZ"e G d d                      Z#ddde$fdZ%dS )    N)	dataclassfield)BytesIO)Path)TYPE_CHECKINGAnyClassVarDictOptionalUnion   )config)DownloadConfig)
array_cast)is_local_pathis_remote_urlxopen)no_op_if_value_is_nullstring_to_dict   )FeatureTypepdfpdfplumber.pdf.PDFreturnc                     t                      5 }| j        D ]!}|                    |j        j                   "|                                cddd           S # 1 swxY w Y   dS )z-Convert a pdfplumber.pdf.PDF object to bytes.N)r   pageswriter   streamgetvalue)r   bufferpages      _/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/datasets/features/pdf.pypdf_to_bytesr#      s    	 !fI 	* 	*DLL))))  ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !s   =AA Ac            	          e Zd ZU dZdZeed<    edd          Ze	e
         ed<   dZee
         ed	<    ej         ej                     ej                    d
          Zee         ed<    ed dd          Ze
ed<   d Zdee
eeedf         defdZddeddfdZdedee
df         f         fdZdeej        ej        ej        f         dej        fdZ 	 ddej        dededej        fdZ!dS )Pdfa  
    **Experimental.**
    Pdf [`Feature`] to read pdf documents from a pdf file.

    Input: The Pdf feature accepts as input:
    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).
    - A `pathlib.Path`: path to the pdf file (i.e. random access is allowed).
    - A `dict` with the keys:
        - `path`: String with relative path of the pdf file in a dataset repository.
        - `bytes`: Bytes of the pdf file.
      This is useful for archived files with sequential access.

    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.

    Args:
        decode (`bool`, defaults to `True`):
            Whether to decode the pdf data. If `False`,
            returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.

    Examples:

    ```py
    >>> from datasets import Dataset, Pdf
    >>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
    >>> ds.features["pdf"]
    Pdf(decode=True, id=None)
    >>> ds[0]["pdf"]
    <pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>
    >>> ds = ds.cast_column("pdf", Pdf(decode=False))
    >>> ds[0]["pdf"]
    {'bytes': None,
    'path': 'path/to/pdf/file.pdf'}
    ```
    TdecodeNF)defaultrepridr   dtypebytespathpa_type)r'   initr(   _typec                     | j         S N)r.   )selfs    r"   __call__zPdf.__call__K   s
    |    valuer   c                    t           j        rddl}nd}t          |t                    r|ddS t          |t
                    r$t	          |                                          ddS t          |t          t          f          rd|dS |)t          ||j	        j
                  rt          |          S |                    d          =t          j                            |d                   rd|                    d          dS |                    d          |                    d          +|                    d          |                    d          dS t!          d| d          )	zEncode example into a format for Arrow.

        Args:
            value (`str`, `bytes`, `pdfplumber.pdf.PDF` or `dict`):
                Data passed as input to Pdf feature.

        Returns:
            `dict` with "path" and "bytes" fields
        r   Nr-   r,   r-   r+   r,   zRA pdf sample should have one of 'path' or 'bytes' but they are missing or None in .)r   PDFPLUMBER_AVAILABLE
pdfplumber
isinstancestrr   absoluter,   	bytearrayr   PDFencode_pdfplumber_pdfgetosr-   isfile
ValueError)r3   r6   r;   s      r"   encode_examplezPdf.encode_exampleN   so    & 	JeS!! 	!D111t$$ 	 0 011DAAAy122 	 5111#
5*.:L(M(M#(///YYv*rw~~eFm/L/L*!599V+<+<===YYw+uyy/@/@/L"YYw//69J9JKKKmejmmm  r5   c                    | j         st          d          t          j        rddl}nt          d          |i }|d         |d         }}||t          d| d          t          |          r |j        |          }n|	                    d	          d
         }|
                    t          j                  rt          j        nt          j        }	 t          ||          d         }	|                    |	          }
n# t          $ r d}
Y nw xY wt!          |
          }t#          |d|          } |j        |          S  |j        t%          |                    5 }|}ddd           n# 1 swxY w Y   |S )ai  Decode example pdf file into pdf data.

        Args:
            value (`str` or `dict`):
                A string with the absolute pdf file path, a dictionary with
                keys:

                - `path`: String with absolute or relative pdf file path.
                - `bytes`: The bytes of the pdf file.

            token_per_repo_id (`dict`, *optional*):
                To access and decode pdf files from private repositories on
                the Hub, you can pass a dictionary
                repo_id (`str`) -> token (`bool` or `str`).

        Returns:
            `pdfplumber.pdf.PDF`
        zKDecoding is disabled for this feature. Please use Pdf(decode=True) instead.r   Nz6To support decoding pdfs, please install 'pdfplumber'.r-   r,   z@A pdf should have one of 'path' or 'bytes' but both are None in r9   ::repo_idtokenrbdownload_config)r&   RuntimeErrorr   r:   r;   ImportErrorrE   r   opensplit
startswithHF_ENDPOINTHUB_DATASETS_URLHUB_DATASETS_HFFS_URLr   rB   r   r   r   )r3   r6   token_per_repo_idr;   r-   bytes_r   
source_urlpatternrJ   rL   rO   fps                 r"   decode_examplezPdf.decode_exampleq   s   & { 	nlmmm& 	XVWWW$ "V}eGnf>| !ldi!l!l!lmmm && .)*/$//CC!%D!1!1"!5J &001CDD://#9 
%"0W"E"Ei"P 1 5 5g > >% % % % $%&45&A&A&AOdD/JJJA*:?1--- 11 Q               
s$   +D DD"E11E58E5r   c                 N    ddl m} | j        r| n |d           |d          dS )zfIf in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.r   )Valuebinarystringr+   )featuresr`   r&   )r3   r`   s     r"   flattenzPdf.flatten   sK    ###### {DD xh 	
r5   storagec                    t           j                            |j                  rrt          j        dgt          |          z  t          j                              }t           j                            ||gddg|	                                          }nt           j        
                    |j                  rrt          j        dgt          |          z  t          j                              }t           j                            ||gddg|	                                          }n5t           j                            |j                  r|j                            d          dk    r|                    d          }n8t          j        dgt          |          z  t          j                              }|j                            d          dk    r|                    d          }n8t          j        dgt          |          z  t          j                              }t           j                            ||gddg|	                                          }t          || j                  S )a  Cast an Arrow array to the Pdf arrow storage type.
        The Arrow types that can be converted to the Pdf pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the image bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter
        - `pa.list(*)` - it must contain the pdf array data

        Args:
            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Pdf arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Ntyper,   r-   maskr   )patypes	is_stringrh   arraylenra   StructArrayfrom_arraysis_null	is_binaryrb   	is_structget_field_indexr   r   r.   )r3   re   bytes_array
path_arrays       r"   cast_storagezPdf.cast_storage   s   & 8gl++ 	w(D6CLL#8ry{{KKKKn00+w1G'SYIZahapaparar0ssGGX-- 	w4&3w<<"7bikkJJJJn00':1FRXHY`g`o`o`q`q0rrGGX-- 		w|++G4499%mmG44 hvG'<29;;OOO|++F33q88$]]622

Xtfs7||&;")++NNN
n00+z1JWV\L]dkdsdsdudu0vvG'4<000r5   local_filesremote_filesc                    i t           fd            t          j        fd|                                D             t          j                              }t          j        fd|                    d                                          D             t          j                              }t          j                            ||gddg|	                                          }t          || j                  S )	a3  Embed PDF files into the Arrow array.

        Args:
            storage (`pa.StructArray`):
                PyArrow array to embed.
            token_per_repo_id (`dict`, optional):
                Dictionary repo_id -> token to fetch the files bytes.
            local_files (`bool`, defaults to `True`)
                Whether to embed local files data in the array

                <Added version="4.8.5"/>
            remote_files (`bool`, defaults to `True`)
                Whether to embed remote files data in the array.
                E.g. files with paths that start with hf:// or https://

                <Added version="4.8.5"/>

        Returns:
            `pa.StructArray`: Array in the PDF arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Nc                    |                      d          d         }|                    t          j                  rt          j        nt          j        }t          ||          }|                    |d                   nd }t          |          }t          | d|          5 }|
                                cd d d            S # 1 swxY w Y   d S )NrH   rI   rJ   rK   rM   rN   )rS   rT   r   rU   rV   rW   r   rB   r   r   read)r-   rZ   r[   source_url_fieldsrL   rO   r\   rX   s          r"   path_to_bytesz(Pdf.embed_storage.<locals>.path_to_bytes   s   D))"-J+5+@+@AS+T+Tv''Z`Zv  !/z7 C CK\Kh%))*;I*FGGGnrE,5999OtT?CCC  qvvxx                                   s   %CC
C
c                     g | ]U}|O|d         ?rt          |d                   sr&t          |d                   r |d                   n	|d         nd VS )Nr,   r-   )r   r   ).0xry   r   rz   s     r"   
<listcomp>z%Pdf.embed_storage.<locals>.<listcomp>  s     
 
 
  =	 z)% **7&	*B*B *HT *YfghiogpYqYq * "M!F),,, 7 
 
 
r5   rg   c                     g | ]I}|Crt          |          sr.t          |          rt          j                            |          n|nd JS r2   )r   r   rC   r-   basename)r   r-   ry   rz   s     r"   r   z%Pdf.embed_storage.<locals>.<listcomp>  s     	 	 	  # $(5d(;(;AMR_`dReReBG$$T*** 	 	 	r5   r-   r,   ri   )r   rk   rn   	to_pylistra   r   rb   rp   rq   rr   r   r.   )r3   re   rX   ry   rz   rv   rw   r   s     ```  @r"   embed_storagezPdf.embed_storage   sI   0 $ "			  		  		  		  
 			  h
 
 
 
 
 
 !**,,
 
 
 
 
 
 X	 	 	 	 	 $MM&11;;==	 	 	 
 
 

 .,,k:-FRXHY`g`o`o`q`q,rr'4<000r5   r2   )NTT)"__name__
__module____qualname____doc__r&   bool__annotations__r   r)   r   r=   r*   r	   rk   structra   rb   r.   r   r0   r4   r   r,   r?   dictrF   r^   r
   rd   StringArrayrp   	ListArrayrx   r    r5   r"   r%   r%      s        ! !F FDd777B777 0E8C=///&RYibikk'R'RSSGXc]SSSu5u===E3===  !E#uiG[*[$\ !ae ! ! ! !F8 8D 8EY 8 8 8 8t
}d33E.FFG 
 
 
 
#1E".".",*V$W #1\^\j #1 #1 #1 #1L osC1 C1~C1LPC1gkC1	C1 C1 C1 C1 C1 C1r5   r%   c                     t          | d          r0t          | j        d          r| j        j        r| j        j        ddS dt          |           dS )aA  
    Encode a pdfplumber.pdf.PDF object into a dictionary.

    If the PDF has an associated file path, returns the path. Otherwise, serializes
    the PDF content into bytes.

    Args:
        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

    Returns:
        dict: A dictionary with "path" or "bytes" field.
    r   nameNr8   )hasattrr   r   r#   )r   s    r"   rA   rA   #  s^     sH :'#*f"="= :#*/ :
$777 |C'8'8999r5   )&rC   dataclassesr   r   ior   pathlibr   typingr   r   r	   r
   r   r   pyarrowrk    r   download.download_configr   tabler   utils.file_utilsr   r   r   utils.py_utilsr   r   r;   rc   r   r,   r#   r%   r   rA   r   r5   r"   <module>r      s   				 ( ( ( ( ( ( ( (             F F F F F F F F F F F F F F F F           5 5 5 5 5 5       B B B B B B B B B B C C C C C C C C  &%%%%%%!* !u ! ! ! ! A1 A1 A1 A1 A1 A1 A1 A1H:3 : : : : : : :r5   