
    jn                         d dl mZmZmZmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ eej        ej        df         Z G d d	e
          ZdS )
    )AnyListOptionalUnion)etree)Self)DocumentPage)read_txt_file)loggerNc                       e Zd ZdZ	 	 ddee         def fdZdee	         fdZ
edee	         f fd            Zd	efd
Ze	 	 ddededee         dedef
d            Ze	 	 	 ddededee         dee         dedefd            Z xZS )XMLDocumentzClass for handling .xml documents. This class uses rules based parsing to identify
    sections of interest within the document.N
stylesheetparserc                     |s,|rt          j        d          nt          j        d          }|| _        || _        d| _        t                                                       dS )a  Class for parsing XML documents. XML documents are parsed using lxml.

        Parameters
        ----------
        filename:
            The name of the XML file to read
        stylesheet:
            An XLST stylesheet that can be applied to transform the XML file
        parser:
            The lxml parser to use with the file. The HTML parser is used by default
            because it is more tolerant of special characters and malformed XML. If you
            are using a stylesheet, you likely want the XMLParser.
        T)remove_commentsN)r   	XMLParser
HTMLParserr   r   document_treesuper__init__)selfr   r   	__class__s      d/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/documents/xml.pyr   zXMLDocument.__init__   ss    $  	 <5555%d;;;  %!    returnc                     t           )N)NotImplementedError)r   s    r   _parse_pages_from_element_treez*XMLDocument._parse_pages_from_element_tree/   s    !!r   c                 j    | j         |                                 | _         t                      j        S )z1Gets all elements from pages in sequential order.)_pagesr   r   pages)r   r   s    r   r"   zXMLDocument.pages2   s,     ;==??DKww}r   contentc                 \   t          | j        t          j                  }|r|                    d          s|rd|z   }| j        	 t          j        || j                  }|t          d          n<# t          $ r/ t          j        |                                | j                  }Y nw xY w| j	        rkt          | j        t          j                  rt          j        d           t          j        | j	                  }t          j        |          } ||          }|| _        | j        S )zDReads in an XML file and converts it to an lxml element tree object.
Nzdocument_tree is NonezYou are using the HTML parser with an XSLT stylesheet. Stylesheets are more commonly parsed with the XMLParser. If your HTML does not display properly, try `import lxml.etree as etree` and setting `parser=etree.XMLParser()` instead.)
isinstancer   r   r   
startswithr   
fromstring
ValueErrorencoder   r   warningparseXSLT)r   r#   is_html_parserr   xslt	transforms         r   	_read_xmlzXMLDocument._read_xml9   sC    $DK1ABB 	%7--d33 	% 	%WnG%
P % 0$+ F F ($%<=== )  P P P % 01A1A4; O OP  9dk5+;<< N>   {4?33!Jt,,	 )	- 8 8!.D!!s   +A2 26B+*B+textkwargsc                 n    t          j        d            | d||d|}|                    |           |S )zFSupports reading in an XML file as a raw string rather than as a file.z Reading document from string ...r   r    )r   infor1   )clsr2   r   r   r3   docs         r   from_stringzXMLDocument.from_string`   sH     	6777cAJAA&AAd
r   filenameencodingc                 L    t          ||          \  }} | j        |f||d|S )N)r;   r<   r5   )r   r:   )r8   r;   r   r   r<   r3   _r#   s           r   	from_filezXMLDocument.from_filen   s<     #HxHHH
7swWv*WWPVWWWr   )NN)NNN)__name__
__module____qualname____doc__r   strVALID_PARSERSr   r   r
   r   propertyr"   r1   classmethodr   r   r:   r?   __classcell__)r   s   @r   r   r      s       1 1
 %) $ SM      <"T
 " " " " tDz      X%" %" %" %" %"N  !%$(	   SM	
  
   [  !%$("&
X 
X
X 
X SM	
X
 3-
X 
X 

X 
X 
X [
X 
X 
X 
X 
Xr   r   )typingr   r   r   r   lxmlr   typing_extensionsr   unstructured.documents.baser	   r
    unstructured.file_utils.encodingr   unstructured.loggerr   r   r   rE   r   r6   r   r   <module>rO      s    - - - - - - - - - - - -       " " " " " " 6 6 6 6 6 6 6 6 : : : : : : & & & & & &e&=>lX lX lX lX lX( lX lX lX lX lXr   