
    ja                        U d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlmZ d d	lmZmZ d d
lmZmZmZmZ d dl m!Z! d dl"m#Z# dZ$e%e&d<   	 	 	 	 d"de
e%         de
eee'         ef                  de
e%         de
e%         dee
e%                  f
dZ(	 d#dee%ee'         f         de
e%         dee
e%                  fdZ) e             eej*                   e            dddddddddddgdfde
e%         de
eee'         ef                  de
e%         de+de
e%         de
e%         de+de
e%         de
e%         de
e%         de
e	e%                  d e+de	e         fd!                                    Z,dS )$    N)BytesIO)SpooledTemporaryFile)IOBinaryIOIteratorListOptionalUnioncast)etree)add_chunking_strategy)ElementElementMetadataTextprocess_metadata)read_txt_file)FileTypeadd_metadata_with_filetype)exactly_oneget_last_modified_date get_last_modified_date_from_filespooled_to_bytes_io_if_needed)apply_lang_metadata)element_from_textxmlDETECTION_ORIGINfilenamefiletextxml_pathreturnc           
         t          | ||           | rt          | |          S |rct          t          t                   t          t          t          t          t          f         |                              }t          ||          S t          t	          t          t          |          d                    }t          ||          S )zGGet leaf elements from the XML tree defined in filename, file, or text.r   r   r   )r    zutf-8)encoding)r   _get_leaf_elementsr   r   bytesr   r
   r   r   r   str)r   r   r   r    fbs         d/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/xml.pyget_leaf_elementsr+      s     48888 8!(X>>>>	 
8uI)U8%99:DAA 
 
 "!h7777E$sD//G<<<==!!h7777    c              #     K   g }t          j        | d          }|;t          |          \  }}t          j        |          }d  ||          D             }|D ]\  }}|dk    r|                    |           |dk    r=|j        "|j                                        r	|j        V  |                                 |rJ|d                                         0|	                                 |r|d                                         0dS )z<Parse the XML tree in a memory efficient manner if possible.)startend)eventsNc              3      K   | ]}d |fV  	dS )r/   N ).0els     r*   	<genexpr>z%_get_leaf_elements.<locals>.<genexpr>B   s&      IIBUBKIIIIIIr,   r.   r/   )
r   	iterparsenextXPathappendr   stripclear	getparentpop)r   r    element_stackelement_iterator_elementcompiled_pathevents           r*   r%   r%   4   s6     
 Mt4DEEE *++
7H--II--2H2HIII*    wG  )))E>>|'GL,>,>,@,@'l"""MMOOO 	 b 1 ; ; = = E  	 b 1 ; ; = = E   r,   FTautoxml_keep_tagsmetadata_filenameinclude_metadatar$   metadata_last_modifiedchunking_strategy	languagesdetect_language_per_elementc                    t          | ||           g }d}| rt          |           }n|rt          |          }|r"t          |p| |p|          }t          |_        nt                      }|ry| rt          | |          \  }}nO|rIt          t          t          t          t          f         |                    }t          ||          \  }}n|r|}t          ||          g}nWt          | |||          }|D ]A}|r=t          |          }t          j        |          |_        |                    |           Bt'          t)          ||
|                    }|S )	a  Partitions an XML document into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    text
        The text of the XML file.
    xml_keep_tags
        If True, will retain the XML tags in the output. Otherwise it will simply extract
        the text from within the tags.
    xml_path
        The xml_path to use for extracting the text. Only used if xml_keep_tags=False.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    include_metadata
        Determines whether or not metadata is included in the metadata attribute on the
        elements in the output.
    metadata_last_modified
        The day of the last modification.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    r#   N)r   last_modified)r   r$   )r   r$   )r   metadata)r   r   r   r    )elementsrK   rL   )r   r   r   r   r   detection_originr   r   r   r
   r   r   r   r+   r   copydeepcopyrO   r:   listr   )r   r   r   rF   r    rG   rH   r$   rI   rJ   rK   rL   kwargsrP   last_modification_daterO   rA   raw_textr(   leaf_elementsleaf_elementrB   s                         r*   partition_xmlrZ   R   s   ` 48888 H! H!7!A!A	 H!A$!G!G %"&2(0J4J
 
 
 %5!!"$$ ) 	'HMMMKAxx 	-U8%99:DAA A (QBBBKAxx 	H h222

 *	
 
 
 * 	) 	)L )+L99#'=#:#: ((((C	
 	
 	
 H Or,   )NNNN)N)-rR   ior   tempfiler   typingr   r   r   r   r	   r
   r   lxmlr   unstructured.chunkingr   unstructured.documents.elementsr   r   r   r    unstructured.file_utils.encodingr    unstructured.file_utils.filetyper   r   unstructured.partition.commonr   r   r   r   unstructured.partition.langr   unstructured.partition.textr   r   r'   __annotations__r&   r+   r%   XMLboolrZ   r2   r,   r*   <module>ri      s          ) ) ) ) ) ) F F F F F F F F F F F F F F F F F F       7 7 7 7 7 7            ; : : : : : Q Q Q Q Q Q Q Q            < ; ; ; ; ; 9 9 9 9 9 9 #    #=A"	8 8sm8
5E$889
:8 3-8 sm	8
 hsm8 8 8 82 #   
RY
 sm  hsm       < HL))"=A"'+!",0'+&,X(-c csmc
5E$889
:c 3-c 	c
 smc  }c c smc %SMc  }c S	"c "&c 
']c c c  *) c c cr,   