
    j-m                        U d Z ddlZddlZddlZddlZddlmZmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+  ej,        dd          Z-g dZ.e
 G d de                      Z/e
 G d dee                      Z0e
 G d de0                      Z1e
 G d de0                      Z2e
 G d d e0                      Z3e
 G d! d"e0                      Z4e
 G d# d$e0                      Z5e
 G d% d&e5                      Z6e
 G d' d(e0                      Z7e
 G d) d*e0                      Z8e
 G d+ d,e0                      Z9e
 G d- d.e0                      Z:dZ;ej<        e/         e=d/<   e
 G d0 d1e0                      Z>e
 G d2 d3e0e                      Z?e
 G d4 d5ee                      Z@ G d6 d7e          ZA G d8 d9e          ZBe
 G d: d;e                      ZCe
 G d< d=eCeAe                      ZDe
 G d> d?eCeBe                      ZEe
 G d@ dAee                      ZFe
 G dB dCeFe                      ZGe
 G dD dEeFe                      ZH G dF dG          ZI G dH dI          ZJ G dJ dK          ZK G dL dM          ZLe
 G dN dO                      ZMdS )Pz^Defines Abstract Base Classes (ABC's) core to batch processing documents
through Unstructured.    N)ABCabstractmethod)	dataclassfield)datetime)Path)DataClassJsonMixin)Json_decode_dataclass)chunk_by_title)DataSourceMetadata)BaseEmbeddingEncoderElement)EnhancedDataClassJsonMixinenhanced_field)_asdict)PartitionErrorSourceConnectionError)logger)partition_via_api)	partition)convert_to_dictflatten_dictAr	   )bound)	s3s3aabfsazgsgcsboxdropboxsftpc                       e Zd ZdZdS )BaseSessionHandlezAbstract Base Class for sharing resources that are local to an individual process.
    e.g., a connection for making a request for fetching documents.N__name__
__module____qualname____doc__     h/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/ingest/interfaces.pyr&   r&   *   s"        G G G Gr-   r&   c                       e Zd ZdS )
BaseConfigNr(   r)   r*   r,   r-   r.   r0   r0   0           Dr-   r0   c                       e Zd ZdS )AccessConfigNr1   r,   r-   r.   r4   r4   5   s         	Dr-   r4   c                   \    e Zd ZU dZdZej        e         ed<   dZ	ej        e
         ed<   dS )RetryStrategyConfiga  
    Contains all info needed for decorator to pull from `self` for backoff
    and retry triggered by exception.

    Args:
        max_retries: The maximum number of attempts to make before giving
            up. Once exhausted, the exception will be allowed to escape.
            The default value of None means there is no limit to the
            number of tries. If a callable is passed, it will be
            evaluated at runtime and its return value used.
        max_retry_time: The maximum total amount of time to try for before
            giving up. Once expired, the exception will be allowed to
            escape. If a callable is passed, it will be
            evaluated at runtime and its return value used.
    Nmax_retriesmax_retry_time)r(   r)   r*   r+   r7   tOptionalint__annotations__r8   floatr,   r-   r.   r6   r6   ;   sK            $(KC'''(,NAJu%,,,,,r-   r6   c                   .   e Zd ZU dZeed<   dZeed<   dZe	j
        e	j        e                  ed<   dZe	j
        e         ed<    ee          Zeed	<   dZe	j
        e	j        e                  ed
<    ed           Ze	j        e         ed<   dZeed<    ee          Ze	j        e         ed<    ee          Ze	j        e         ed<   dZe	j
        e         ed<   dZeed<    edd          Ze	j
        e         ed<   dZe	j
        e         ed<   dS )PartitionConfigFpdf_infer_table_structureautostrategyNocr_languagesencodingdefault_factoryadditional_partition_argsskip_infer_table_typesc                  
    g dS )N)
element_idtexttypemetadata
embeddingsr,   r,   r-   r.   <lambda>zPartitionConfig.<lambda>[   s     X X X r-   fields_includeflatten_metadatametadata_excludemetadata_includez.https://api.unstructured.io/general/v0/generalpartition_endpointpartition_by_apiTdefault	sensitiveapi_keyhi_res_model_name)r(   r)   r*   r@   boolr<   rB   strrC   r9   r:   ListrD   r   dictrG   rH   rP   rQ   listrR   rS   rT   rU   r   rY   rZ   r,   r-   r.   r?   r?   Q   s         ',t+++Hc-1M1:afSk*111 $Hajo$$$&+eD&A&A&AtAAA6:AJqvc{3:::"'%XX# # #NAF3K    #d"""$)E$$?$?$?afSk???$)E$$?$?$?afSk???*Z
3ZZZ"d"""-~ddKKKGQZ_KKK)-qz#-----r-   r?   c                       e Zd ZU dZeed<   dZeed<    e ej	                    dz  dz  dz  dz  
                                          Zeed<   d	Zeed
<   dZeed<   dZeed<   dS )ProcessorConfigF	reprocessverbosez.cacheunstructuredingestpipelinework_dirzstructured-output
output_dir   num_processesraise_on_errorN)r(   r)   r*   rb   r[   r<   rc   r\   r   homeresolverg   rh   rj   r;   rk   r,   r-   r.   ra   ra   f   s         ItGTCx/.@8KjXaaccddHcddd)J)))M3 ND     r-   ra   c                   p    e Zd ZU eed<   dZeed<   dZeed<   dZe	j
        e	j        e                  ed<   dS )FileStorageConfig
remote_urlF
uncompress	recursiveN	file_glob)r(   r)   r*   r\   r<   rq   r[   rr   rs   r9   r:   r]   r,   r-   r.   ro   ro   p   sX         OOOJIt)-Iqz!&+&-----r-   ro   c                       e Zd ZU dZeed<    ed          Zeed<    ed          Z	eed<    ed          Z
eed<    ed          Zeed<   d	efd
Zd ZdS )FsspecConfigNaccess_configF)initprotocolpath_without_protocoldir_path	file_pathreturnc                 J    | j         r| j                             d          S i S )NF)apply_name_overload)rv   to_dictselfs    r.   get_access_configzFsspecConfig.get_access_config   s,     	%--%-HHHIr-   c                 n   | j                             d          \  | _        | _        | j        t          vr t          d| j         dt           d          t          j        | j         d| j                   }|r| j        dk    rd| _        d| _	        d S t          j        | j         d	| j                   }|rC| j        dk    r8|
                    d
          | _        |
                    d          pd| _	        d S t          j        | j         d| j                   }|r#|
                    d
          | _        d| _	        d S t          j        | j         d| j                   }|st          d| j          d          |
                    d
          | _        |
                    d          pd| _	        d S )Nz://z	Protocol z not supported yet, only z are supported.z
://([\s])/r#     z:///([^/\s]+?)/([^\s]*)   ri   z://([^/\s]+?)(/*)$z://([^/\s]+?)/([^\s]*)zInvalid path z6. Expected <protocol>://<dir-path>/<file-or-dir-path>.)rp   splitrx   ry   !SUPPORTED_REMOTE_FSSPEC_PROTOCOLS
ValueErrorrematchrz   r{   group)r   r   s     r.   __post_init__zFsspecConfig.__post_init__   s   48O4I4I%4P4P1t1= AAAFDM F F4F F F   T]666HH 	T]i//DMDNF T]CCCT_UU 	T]i//!KKNNDM"[[^^1rDNF T]>>>PP 	!KKNNDMDNF T]BBBDOTT 	H H H H   AQ-2r-   )r(   r)   r*   rv   r4   r<   r   rx   r\   ry   rz   r{   r^   r   r   r,   r-   r.   ru   ru   x   s         "&M<&&&Eu%%%Hc%%%!&E!2!2!23222Eu%%%Hc%%%U&&&Is&&&4    %. %. %. %. %.r-   ru   c                   l    e Zd ZU dZeed<   dZeed<   dZeed<   dZ	eed<   dZ
ej        e         ed<   dS )	
ReadConfigr   download_dirFre_downloadpreserve_downloadsdownload_onlyNmax_docs)r(   r)   r*   r   r\   r<   r   r[   r   r   r   r9   r:   r;   r,   r-   r.   r   r      sk          L#K$$$$M4 $Hajo$$$$$r-   r   c                       e Zd ZU eed<    edd          Zej        e         ed<   dZ	ej        e         ed<   de
fdZdS )	EmbeddingConfigproviderNTrV   rY   
model_namer|   c                    i }| j         r
| j         |d<   | j        r
| j        |d<   | j        dk    rddlm}m}  | |d
i |          S | j        dk    rddlm}m}  | |d
i |          S t          | j         d	          )NrY   r   zlangchain-openair   )OpenAiEmbeddingConfigOpenAIEmbeddingEncoder)configzlangchain-huggingface)HuggingFaceEmbeddingConfigHuggingFaceEmbeddingEncoderz not a recognized encoderr,   )
rY   r   r   unstructured.embed.openair   r   unstructured.embed.huggingfacer   r   r   )r   kwargsr   r   r   r   s         r.   get_embedderzEmbeddingConfig.get_embedder   s    < 	- $F9? 	3#'?F< =...________))1F1F1P1P1P1PQQQQ]555       
 /.6P6P6Z6ZSY6Z6Z[[[[HHHIIIr-   )r(   r)   r*   r\   r<   r   rY   r9   r:   r   r   r   r,   r-   r.   r   r      s}         MMM-~ddKKKGQZ_KKK"&J
3&&&J2 J J J J J Jr-   r   c                       e Zd ZU dZeed<   dZeed<   dZeed<   dZ	eed<   d	Z
ej        e         ed
<   dej        e         dej        e         fdZd	S )ChunkingConfigFchunk_elementsTmultipage_sectionsi  combine_text_under_n_charsi  max_charactersNnew_after_n_charselementsr|   c                 d    | j         r(t          || j        | j        | j        | j                  S |S )N)r   r   r   r   r   )r   r   r   r   r   r   )r   r   s     r.   chunkzChunkingConfig.chunk   sF     		!!#'#:+/+J#2"&"8    Or-   )r(   r)   r*   r   r[   r<   r   r   r;   r   r   r9   r:   r]   r   r   r,   r-   r.   r   r      s          ND   ####&))))NC)-qz#---
afWo 
!&/ 
 
 
 
 
 
r-   r   c                       e Zd ZU  ed          Zej        e         ed<    ed          Z	ej        e         ed<    eddd	          Z
ej        e         ed
<   dS )PermissionsConfigpermissions_application_id)overload_nameapplication_idpermissions_tenanttenantNTpermissions_client_cred)rW   rX   r   client_cred)r(   r)   r*   r   r   r9   r:   r\   r<   r   r   r,   r-   r.   r   r      s         &4nC_&`&`&`NAJsO```,n;OPPPFAJsOPPP#1>4M$ $ $KC     r-   r   global_write_session_handlec                       e Zd ZdS )WriteConfigNr1   r,   r-   r.   r   r      r2   r-   r   c                       e Zd ZdZdS )BaseConnectorConfigzEAbstract definition on which to define connector-specific attributes.Nr'   r,   r-   r.   r   r      s        OOOOr-   r   c                   "   e Zd ZU dZej        e         ed<   dZej        e         ed<   dZ	ej        e         ed<   dZ
ej        e         ed<   dZej        e         ed<   dZej        ej        ej        eej        f                           ed<   dS )SourceMetadataNdate_createddate_modifiedversion
source_urlexistspermissions_data)r(   r)   r*   r   r9   r:   r\   r<   r   r   r   r   r[   r   r]   DictAnyr,   r-   r.   r   r      s         $(L!*S/(((%)M1:c?)))#GQZ_###"&J
3&&&#FAJt###?CajsAEz(:!;<CCCCCr-   r   c            
            e Zd ZdZg dZg dZdedej        e	         fdZ
dej        e	ef         fdZed	d
ddej        e         dededef fd            Z xZS )IngestDocJsonMixin
    Inherently, DataClassJsonMixin does not add in any @property fields to the json/dict
    created from the dataclass. This explicitly sets properties to look for on the IngestDoc
    class when creating the json/dict for serialization purposes.
    )r   r   date_processedr   r   r   r   )base_filenamefilename_output_filenamerecord_locator_source_metadata	unique_idas_dictpropsc                     |D ]f}t          | |          }t          |t                    rt          |          }t          |t                    r|                    d          }|||<   gd S NFencode_jsongetattr
isinstancer   r\   r	   r   r   r   r   propvals        r.   	add_propszIngestDocJsonMixin.add_props#  x     	  	 D$%%C#t$$ #hh#122 5kkek44GDMM	  	 r-   r|   c                     t          | fi |}|                     || j                   t          | d          |                     || j                   |S )Nr   r   r   )r   r   properties_to_serializer   metadata_properties)r   r   r   s      r.   r   zIngestDocJsonMixin.to_dict,  s_    $))&))wd.JKKK4+,,8NN7$2JNKKKr-   FT)infer_missingr~   clskvsr~   c                    t                                          |||          }|                    d          x}r)t          |dt                              |                     |                    d          x}rt          |d|           |S )N)r   r   r~   r   _date_processed)super	from_dictgetsetattrr   )r   r   r   r~   docmetar   	__class__s          r.   r   zIngestDocJsonMixin.from_dict3  s     gg=FY   
 
 77-...4 	MC+^-E-Ed-K-KLLL WW%6777> 	<C*N;;;
r-   )r(   r)   r*   r+   r   r   r^   r9   r]   r\   r   r   r
   r   classmethodTyper   r[   r   __classcell__)r   s   @r.   r   r   
  s                afSk        16#t)#4     49W[
 
 
VAY
!
PT
	

 
 
 
 
 [
 
 
 
 
r-   r   c                       e Zd ZdZdgZdedej        e         fdZ	ddej
        eef         fdZedd	d
ej        e         dedefd            ZdS )BatchIngestDocJsonMixinr   r   r   r   c                     |D ]f}t          | |          }t          |t                    rt          |          }t          |t                    r|                    d          }|||<   gd S r   r   r   s        r.   r   z!BatchIngestDocJsonMixin.add_propsJ  r   r-   Fr|   c                 `    t          | |          }|                     || j                   |S )Nr   r   )r   r   r   )r   r   r   s      r.   r   zBatchIngestDocJsonMixin.to_dictS  s2    $K888wd.JKKKr-   )r   r   r   c                (    t          | ||          }|S N)r   )r   r   r   r   s       r.   r   z!BatchIngestDocJsonMixin.from_dictX  s    S-88
r-   N)F)r(   r)   r*   r+   r   r^   r9   r]   r\   r   r   r
   r   r   r   r   r   r,   r-   r.   r   r   A  s           +m   afSk         AF39,=    
 >C   qvay t     [  r-   r   c                   Z    e Zd ZU eed<   eed<   eed<   eede	fd                        Z
dS )BaseIngestDocprocessor_configread_configconnector_configr|   c                     d S r   r,   r   s    r.   r   zBaseIngestDoc.unique_idd  s	     	r-   N)r(   r)   r*   ra   r<   r   r   propertyr   r\   r   r,   r-   r.   r   r   ^  si         %%%%))))3    ^ X  r-   r   c                      e Zd ZU dZ edd          Zej        e         e	d<    edd          Z
ej        e         e	d<   edefd            Zej        d	efd
            Zedej        e         fd            Zedej        e         fd            Zedej        e         fd            Zedej        e         fd            Zeed                         Zedej        e         fd            Zedej        e         fd            Zeed                         Zedej        ej        eej        f                  fd            Zedefd            Zedej        e         fd            Zedej        e         fd            Zedej        ej        ej        eej        f                           fd            Z ed             Z!e"d             Z#d!dZ$d Z%ee&j'        d                         Z(defdZ)e*j'        de+dej        e,         fd            Z-de+dej        ej        ej        eej        f                           fd Z.dS )"BaseSingleIngestDocam  An "ingest document" is specific to a connector, and provides
    methods to fetch a single raw document, store it locally for processing, any cleanup
    needed after successful processing of the doc, and the ability to write the doc's
    structured outputs once processed.

    Crucially, it is not responsible for the actual processing of the raw document.
    FN)rw   rW   r   r   r|   c                 r    | j         |                                  | j         t          d          | j         S )Nzfailed to set source metadata)r   update_source_metadatar   r   s    r.   source_metadataz#BaseSingleIngestDoc.source_metadataw  s>     (''))) (<===$$r-   valuec                     || _         d S r   )r   )r   r  s     r.   r  z#BaseSingleIngestDoc.source_metadata  s     %r-   c                     | j         j        S )z7The date the document was created on the source system.)r  r   r   s    r.   r   z BaseSingleIngestDoc.date_created  s     #00r-   c                     | j         j        S )z=The date the document was last modified on the source system.)r  r   r   s    r.   r   z!BaseSingleIngestDoc.date_modified  s     #11r-   c                     | j         S )zThe date the document was last processed by Unstructured.
        self._date_processed is assigned internally in self.partition_file())r   r   s    r.   r   z"BaseSingleIngestDoc.date_processed  s     ##r-   c                     | j         j        S )z1Whether the document exists on the remote source.)r  r   r   s    r.   r   zBaseSingleIngestDoc.exists  s     #**r-   c                     dS zEThe local filename of the document after fetching from remote source.Nr,   r   s    r.   r   zBaseSingleIngestDoc.filename        r-   c                     | j         j        rk| j        rdt          t	          | j         j                                                            }t          | j                  }|                    |d          }|S d S Nr   )r   r   r   r\   r   rm   replace)r   download_path	full_path	base_paths       r.   r   z!BaseSingleIngestDoc.base_filename  so    ( 	T] 	T%5%B C C K K M MNNMDM**I!))-<<Itr-   c                     | j         j        rk| j        rdt          t	          | j         j                                                            }t          | j                  }|                    |d          }|S d S r  )r   rh   r   r\   r   rm   r  )r   output_pathr  r  s       r.   base_output_filenamez(BaseSingleIngestDoc.base_output_filename  sq     + 	0E 	d4#8#CDDLLNNOOKD122I!))+r::Itr-   c                     dS )z/Filename of the structured output for this doc.Nr,   r   s    r.   r   z$BaseSingleIngestDoc._output_filename  r
  r-   c                     dS )zdA dictionary with any data necessary to uniquely identify the document on
        the source system.Nr,   r   s    r.   r   z"BaseSingleIngestDoc.record_locator  s	     tr-   c                     | j         S r   )r   r   s    r.   r   zBaseSingleIngestDoc.unique_id  s
    }r-   c                     | j         j        S )zThe url of the source document.)r  r   r   s    r.   r   zBaseSingleIngestDoc.source_url  s     #..r-   c                     | j         j        S )zThe version of the source document, this could be the last modified date, an
        explicit version number, or anything else that can be used to uniquely identify
        the version of the document.)r  r   r   s    r.   r   zBaseSingleIngestDoc.version  s    
 #++r-   c                 P    | j         |                                  | j         j        S )zHAccess control data, aka permissions or sharing, from the source system.)r  r   r   r   s    r.   r   z$BaseSingleIngestDoc.permissions_data  s+     ''')))#44r-   c                     dS )zORemoves the local copy the file (or anything else) after successful processing.Nr,   r   s    r.   cleanup_filez BaseSingleIngestDoc.cleanup_file  r
  r-   c                 F     t          j                    fd            }|S )zDecorator that checks if a file exists, is not empty, and should not re-download,
        if so log a message indicating as much and skip the decorated function.c                     | j         j        s]| j                                        rD| j                                        j        r&t          j        d| j         dj                    d S  | g|R i |S )NzFile exists: z, skipping )	r   r   r   is_filestatst_sizer   debugr(   )r   argsr   funcs      r.   wrapperz8BaseSingleIngestDoc.skip_if_file_exists.<locals>.wrapper  s     $0M))++ M&&((0
 VT]VVt}VVWWWt4.t...v...r-   )	functoolswraps)r#  r$  s   ` r.   skip_if_file_existsz'BaseSingleIngestDoc.skip_if_file_exists  s:    
 
			/ 	/ 	/ 	/ 
		/ r-   c                 ,    t                      | _        dS )z7Sets the SourceMetadata and the  properties for the docN)r   r   )r   r   s     r.   r   z*BaseSingleIngestDoc.update_source_metadata  s     . 0 0r-   c                     d| _         dS )zSets the _permissions_data property for the doc.
        This property is later used to fill the corresponding SourceMetadata.permissions_data field,
        and after that carries on to the permissions_data property.N)_permissions_datar   s    r.   update_permissions_dataz+BaseSingleIngestDoc.update_permissions_data  s     >Br-   c                     dS )zAFetches the "remote" doc and stores it locally on the filesystem.Nr,   r   s    r.   get_filezBaseSingleIngestDoc.get_file  r
  r-   c                 p    | j                                         o| j                                         j        S )z;Determine if structured output for this doc already exists.)r   r  r  r   r   s    r.   
has_outputzBaseSingleIngestDoc.has_output  s.    $,,..W43H3M3M3O3O3WWr-   partition_configc                    |j         smt          j        d           t          dt	          | j                  t          | j        | j        | j	        | j
        | j        | j        | j                  d|}nd|j        }t          j        d| d           d |                                D             }t!          dt	          | j                  |j        |d|}|S )	NzUsing local partition)urlr   r   r   r   r   r   )r   data_source_metadatazUsing remote partition ()c                 8    i | ]\  }}||t          |          S r   )r\   ).0kvs      r.   
<dictcomp>z6BaseSingleIngestDoc.partition_file.<locals>.<dictcomp>  s,     , , ,"a!-3q66---r-   )r   rY   api_urlr,   )rU   r   r!  r   r\   r   r   r   r   r   r   r   r   r   rT   itemsr   rY   )r   r0  partition_kwargsr   endpointpassthrough_partition_kwargss         r.   partition_filez"BaseSingleIngestDoc.partition_file  s     0 	L0111  T]++%7 L#'#6!%!2"&"4#'#6%)%:& & &  # HH (:HL?H???@@@, ,&6&<&<&>&>, , ,( ) T]++(0   /	 H r-   c                    t          j                                                    | _        | j        j        rd S t          j        d| j                     | j	        d
d|i|}t          |          }g | _        |D ]}|j        r|j        rt          d          |j        r|j        }|D ]u}d|v rS|                    d          }|}	|d d         D ]}
|
|	v r|	|
         }	|d         }||	v r|	                    |d            Y|d                             |d            vnX|j        rQ|j        t#          |d                                                   D ]"}|vr|d                             |d            #|j        fd|                                D             }|j        r>d|v r:|                    d          }|                    t/          |dg	                     | j                            |           | j        S )NzProcessing r0  z_Arguments `--metadata-include` and `--metadata-exclude` are mutually exclusive with each other..rM   c                 $    i | ]\  }}|v 	||S r,   r,   )r6  r7  r8  in_lists      r.   r9  z4BaseSingleIngestDoc.process_file.<locals>.<dictcomp>N  s$    BBBTQQ'\\Aq\\\r-   data_source_record_locator)keys_to_omitr,   )r   utcnow	isoformatr   r   r   r   infor   r?  r   isd_elems_no_filenamerR   rS   r   r   popr_   keysrP   r;  rQ   updater   append)r   r0  r<  isd_elems_raw	isd_elemselemex_listexnested_fieldscurrent_elemffield_to_excluder7  rM   rD  s                 @r.   process_filez BaseSingleIngestDoc.process_file%  sx   
  (00::<<) 	41$-11222++bb=MbQabb#M22	AC" !	4 !	4D0 65E5V 6 :   "2 6*;! 7 7Bbyy(*'+!.ss!3 ? ?A L00/;A+8+<(+|;;(,,-=tDDDZ(,,R66667 "2 6*;d:.335566 6 6A''Z(,,Q555&5GBBBBTZZ\\BBBD0 aZ45G5G88J//LA]@^___```&--d3333))r-   )r|   N)/r(   r)   r*   r+   r   r   r9   r:   r   r<   r   r\   r   r  setterr   r   r   r[   r   r   r   r   r  r   r   r   r   r   r   r   r]   r   r  staticmethodr'  r   r+  r   wrapr-  r/  r   r?   r   r?  rX  r,   r-   r.   r   r   j  sA          495eT3R3R3Raj0RRR',u%'F'F'FOQZ_FFF% % % % X% &^ & & & & 1ajo 1 1 1 X1 2qz# 2 2 2 X2 $
3 $ $ $ X$
 +
4( + + + X+ T T ^ XT qz#    X ajo    X > > ^ X> 
16#qu*+= >    X
 3    X /AJsO / / / X/ ,C , , , X, 5!*QVAF3:4F-G"H 5 5 5 X5 ^ ^ ^^   \$1 1 1 1B B B P P   ^PXD X X X X $)$ 
	$ $ $ $L1*)1* 
AF16#qu*-.	/	1* 1* 1* 1* 1* 1*r-   r   c                   x    e Zd ZU  ee          Zej        e         e	d<   e
ej        d                         ZdS )BaseIngestDocBatchrE   ingest_docsc                     dS )zBFetches the "remote" docs and stores it locally on the filesystem.Nr,   r   s    r.   	get_fileszBaseIngestDocBatch.get_files]  r
  r-   N)r(   r)   r*   r   r_   r^  r9   r]   r   r<   r   r   r[  r`  r,   r-   r.   r]  r]  Y  sg         /4uT/J/J/JK+,JJJQ Q   ^Q Q Qr-   r]  c                   $    e Zd Zed             ZdS )BaseConnectorc                     d S r   r,   r   s    r.   check_connectionzBaseConnector.check_connectione      r-   N)r(   r)   r*   r   rd  r,   r-   r.   rb  rb  c  s-          ^  r-   rb  c                   v    e Zd ZU dZeed<   eed<   eed<   ed	d            Z	ed             Z
ed             ZdS )
BaseSourceConnectorzPAbstract Base Class for a connector to a remote source, e.g. S3 or Google Drive.r   r   r   Nc                     dS )zAny additional cleanup up need after processing is complete. E.g., removing
        temporary download dirs that are empty.

        By convention, documents that failed to process are typically not cleaned up.Nr,   )r   cur_dirs     r.   cleanupzBaseSourceConnector.cleanupr  r
  r-   c                     dS )zInitializes the connector. Should also validate the connector is properly
        configured: e.g., list a single a document from the source.Nr,   r   s    r.   
initializezBaseSourceConnector.initializey  r
  r-   c                     dS )zReturns all ingest docs (derived from BaseIngestDoc).
        This does not imply downloading all the raw documents themselves,
        rather each IngestDoc is capable of fetching its content (in another process)
        with IngestDoc.get_file().Nr,   r   s    r.   get_ingest_docsz#BaseSourceConnector.get_ingest_docs~  r
  r-   r   )r(   r)   r*   r+   ra   r<   r   r   r   rj  rl  rn  r,   r-   r.   rg  rg  j  s         ZZ%%%%))))Y Y Y ^Y G G ^G & & ^& & &r-   rg  c                       e Zd ZU eed<   eed<   dedefdZed             Zede	j
        e         ddfd            Zed	e	j
        e	j        ee	j        f                  ddfd
            Zde	j
        e         ddfdZdS )BaseDestinationConnectorwrite_configr   c                 "    || _         || _        d S r   )rq  r   )r   rq  r   s      r.   __init__z!BaseDestinationConnector.__init__  s    ( 0r-   c                     dS )z]Initializes the connector. Should also validate the connector is properly
        configured.Nr,   r   s    r.   rl  z#BaseDestinationConnector.initialize  r
  r-   docsr|   Nc                     d S r   r,   )r   ru  s     r.   writezBaseDestinationConnector.write  re  r-   elements_dictc                    d S r   r,   )r   rx  r"  r   s       r.   
write_dictz#BaseDestinationConnector.write_dict  re  r-   r   c                 <    d |D             } | j         |d|i| d S )Nc                 6    g | ]}|                                 S r,   )r   )r6  es     r.   
<listcomp>z;BaseDestinationConnector.write_elements.<locals>.<listcomp>  s     777777r-   rx  )rz  )r   r   r"  r   rx  s        r.   write_elementsz'BaseDestinationConnector.write_elements  s7    77h777E]EfEEEEEr-   )r(   r)   r*   r   r<   r   rs  r   rl  r9   r]   r   rw  r   r\   r   rz  r   r  r,   r-   r.   rp  rp    s        ))))1[ 1DW 1 1 1 1   ^ !&!45 $    ^ qvafS!%Z6H/I X\    ^Fqvg FD F F F F F Fr-   rp  c                   "    e Zd ZU eed<   ddZdS )SourceConnectorCleanupMixinr   Nc                 >   | j         j        s| j         j        rdS || j         j        }|!t	          |                                          sdS t          j        |          }t          j        |           |D ]U}t          j	        
                    |          r4t          j	                            |          s|                     |           Vt          j        d           t          t          j        |                    dk    rt          j        |           dS dS )z6Recursively clean up downloaded files and directories.N..r   )r   r   r   r   r   is_diroslistdirchdirpathisdirislinkrj  lenrmdir)r   ri  sub_dirssub_dirs       r.   rj  z#SourceConnectorCleanupMixin.cleanup  s   . 	$2B2P 	F?&3G?$w--"6"6"8"8?F:g&&
 	& 	&Gw}}W%% &bgnnW.E.E &W%%%
rz'""##q((HW )(r-   r   )r(   r)   r*   r   r<   rj  r,   r-   r.   r  r    s6              r-   r  c                   "    e Zd ZU eed<   ddZdS )PermissionsCleanupMixinr   Nc                 *   d }	 |t          | j        j        d          }t          |                                          sd S t          |                                          r|}t          j        |           d S t          j        |          }t          j        |           |D ]6}t
          j	        
                    |          s|                     |           7t          j        d            ||          rt          j        |           d S d S )Nc                 j      fdt          j                   D             }t          |          dk    S )Nc                     g | ]A}t           j                            t           j                            |                    ?|BS r,   )r  r  r  join)r6  itemfolder_paths     r.   r~  zWPermissionsCleanupMixin.cleanup_permissions.<locals>.has_no_folders.<locals>.<listcomp>  sM       7==k4!@!@AA  r-   r   )r  r  r  )r  folderss   ` r.   has_no_folderszCPermissionsCleanupMixin.cleanup_permissions.<locals>.has_no_folders  sH       J{33  G
 w<<1$$r-   r   r  )r   r   rh   r   r  r  remover  r  r  r  cleanup_permissionsr  )r   ri  r  cur_filer  r  s         r.   r  z+PermissionsCleanupMixin.cleanup_permissions  s   	% 	% 	% 	E?40;=OPPGG}}##%% 	F==  "" 	HIhF:g&&
 	2 	2G7>>'** 2((111
>'"" 	HW	 	r-   r   )r(   r)   r*   ra   r<   r  r,   r-   r.   r  r    s6         %%%%     r-   r  c                   F    e Zd ZU eed<   eed                         Zd ZdS )IngestDocCleanupMixinr   c                     dS r	  r,   r   s    r.   r   zIngestDocCleanupMixin.filename  r
  r-   c                     | j         j        sW| j                                        r@| j         j        s6t          j        d|             t          j        | j                   dS dS dS dS )z?Removes the local copy of the file after successful processing.zCleaning up N)	r   r   r   r  r   r   r!  r  unlinkr   s    r.   r  z"IngestDocCleanupMixin.cleanup_file  s      3	%%%''	% $2	%
 L...///Idm$$$$$	% 	% 	% 	% 	% 	%r-   N)	r(   r)   r*   r   r<   r   r   r   r  r,   r-   r.   r  r    sW         T T ^ XT% % % % %r-   r  c                   *    e Zd Zedefd            ZdS )ConfigSessionHandleMixinr|   c                     dS )zCreates a session handle that will be assigned on each IngestDoc to share
        session related resources across all document handling for a given subprocess.Nr,   r   s    r.   create_session_handlez.ConfigSessionHandleMixin.create_session_handle  r
  r-   N)r(   r)   r*   r   r&   r  r,   r-   r.   r  r    sA        Z'8 Z Z Z ^Z Z Zr-   r  c                       e Zd ZU eed<    edd          Zej        e	         ed<   e
d             Zej        de	fd            ZdS )	IngestDocSessionHandleMixinr   NF)rW   rw   _session_handlec                 Z    | j         | j                                        | _         | j         S )zFIf a session handle is not assigned, creates a new one and assigns it.)r  r   r  r   s    r.   session_handlez*IngestDocSessionHandleMixin.session_handle  s-     '#'#8#N#N#P#PD ##r-   r  c                     || _         d S r   )r  )r   r  s     r.   r  z*IngestDocSessionHandleMixin.session_handle  s    -r-   )r(   r)   r*   r  r<   r   r  r9   r:   r&   r   r  rY  r,   r-   r.   r  r    s         ....5:U4e5T5T5TOQZ 12TTT$ $ X$ .-> . . . . . .r-   r  )Nr+   r%  r  r   typingr9   abcr   r   dataclassesr   r   r   pathlibr   dataclasses_jsonr	   dataclasses_json.corer
   r   unstructured.chunking.titler   unstructured.documents.elementsr   unstructured.embed.interfacesr   r   &unstructured.ingest.enhanced_dataclassr   r   +unstructured.ingest.enhanced_dataclass.corer   unstructured.ingest.errorr   r   unstructured.ingest.loggerr   unstructured.partition.apir   unstructured.partition.autor   unstructured.staging.baser   r   TypeVarr   r   r&   r0   r4   r6   r?   ra   ro   ru   r   r   r   r   r   r:   r<   r   r   r   r   r   r   r   r]  rb  rg  rp  r  r  r  r  r  r,   r-   r.   <module>r     s         				 				     # # # # # # # # ( ( ( ( ( ( ( (             / / / / / / 9 9 9 9 9 9 9 9 6 6 6 6 6 6 > > > > > > G G G G G G G G ] ] ] ] ] ] ] ] ? ? ? ? ? ? K K K K K K K K - - - - - - 8 8 8 8 8 8 1 1 1 1 1 1 C C C C C C C CAIc-...
% 
% 
% ! G G G G G G G G
 	 	 	 	 	+S 	 	 	 	 	 	 	 	: 	 	 	
 - - - - -* - - -* . . . . .j . . .( ! ! ! ! !j ! ! ! . . . . .
 . . . 2. 2. 2. 2. 2.$ 2. 2. 2.j % % % % % % % % J J J J Jj J J J6     Z   (     
    >B QZ(9: A A A 	 	 	 	 	* 	 	 	 P P P P P*c P P P D D D D D/ D D D4 4 4 4 43 4 4 4n    8   :     C    k* k* k* k* k*-);S k* k* k*\ Q Q Q Q Q(? Q Q Q     .    & & & & &- & & &6 F F F F F}c F F F4       ,       @% % % % % % % %&Z Z Z Z Z Z Z Z . . . . . . . . . .r-   