
    j!              /           d dl mZmZmZmZmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZmZmZ d dlmZ erd dlm Z   e             eej!                   e	            dddddddi dddddddddgddfdee"         deee#                  dee"         dee"         dee"         de$de$dee"e"f         de$dedee"         de$dee"         dee"         de$dee"         d eee"                  d!e$d"ee"         d#ed$ee         f*d%                                    Z%ddddddgddfde"dee"         deee#                  de$dee"         dee"         d eee"                  d!e$d"ee"         d$ee         fd&Z&d*d)Z'dS )+    )IOTYPE_CHECKINGAnyDictListOptionalN)add_chunking_strategy)Elementprocess_metadata)HTMLDocument)VALID_PARSERS)read_txt_file)convert_file_to_html_text)FileTypeadd_metadata_with_filetype)document_to_element_listexactly_oneget_last_modified_date get_last_modified_date_from_file)apply_lang_metadata)DocumentLayoutFTautofilenamefiletexturlencodinginclude_page_breaksinclude_metadataheaders
ssl_verifyparsersource_formathtml_assemble_articlesmetadata_filenamemetadata_last_modifiedskip_headers_and_footerschunking_strategy	languagesdetect_language_per_elementdetection_originkwargsreturnc                 H   | |                                 dk    r|s| s|sg S t          | |||           d}| (t          |           }t          j        | |	||          }n|;t          |          }t          ||          \  }}t          j        ||	|          }n|'t          |          }t          j        ||	|          }n|t          j
        |||          }|j        st          d|j                   |j        
                    d	d          }|                    d
          st          d| d          t          j        |j        |	          }|rt#          |          }t%          t'          t)          |fd||p||
r|
nd|d|||                    S )ai  Partitions an HTML document into its constituent elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the HTML document.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    include_page_breaks
        If True, includes page breaks at the end of each page in the document.
    include_metadata
        Optionally allows for excluding metadata from the output. Primarily intended
        for when partition_html is called in other partition bricks (like partition_email)
    headers
        The headers to be used in conjunction with the HTTP request if URL is set.
    ssl_verify
        If the URL parameter is set, determines whether or not partition uses SSL verification
        in the HTTP request.
    parser
        The parser to use for parsing the HTML document. If None, default parser will be used.
    source_format
        The source of the original html. If None we will return HTMLElements but for example
         partition_rst will pass a value of 'rst' so that we return Title vs HTMLTitle
    metadata_last_modified
        The last modified date for the document.
    skip_headers_and_footers
        If True, ignores any content that is within <header> or <footer> tags
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    N )r   r   r   r   )r"   r   assemble_articles)r   r   )r"   r0   )r    verifyzURL return an error: zContent-Typez	text/htmlz%Expected content type text/html. Got .)r"   F)sortabler   last_modification_dater#   r+   )r)   r*   )stripr   r   r   	from_filer   r   from_stringstrrequestsgetok
ValueErrorstatus_coder    
startswithr   filter_footer_and_headerlistr   r   )r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r4   document_	file_text_textresponsecontent_types                              e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/html.pypartition_htmlrH      s2   D DJJLLB..t.H.UX.	 4SAAAA!!7!A!A)4	
 
 
 
	!A$!G!G$$BBB9+4
 
 
 
	YY+4
 
 
 
<WZHHH{ 	MKX5IKKLLL'++NB??&&{33 	VT\TTTUUU+HM&III 6+H55$$7'='WAW/<Fmm$!1     (C	
 	
 	
      c	                     d}	|rt          |          }	n|rt          |          }	t          | ||          }
t          |
| |d||p|	|||	  	        S )a  Converts a document to HTML and then partitions it using partition_html. Works with
    any file format support by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, i.e. rst
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it.
    metadata_filename
        The filename to use in element metadata.
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    N)r#   r   r   unicode)	r   r#   r   r   r%   r&   r)   r*   r+   )r   r   r   rH   )r#   r   r   r   r%   r&   r)   r*   r+   r4   	html_texts              rG   convert_and_partition_htmlrM      s    J " H!7!A!A	 H!A$!G!G)#  I #/+5O9O$?)
 
 
 
rI   rA   r   c                 j    | j         D ]*}t          t          d |j                            |_        +| S )Nc                 &    d| j         vod| j         vS )Nfooterheader)ancestortags)els    rG   <lambda>z*filter_footer_and_header.<locals>.<lambda>   s    82?:^xr?^ rI   )pagesr@   filterelements)rA   pages     rG   r?   r?      sE     
 
^^ 
 
 OrI   )rA   r   r-   r   )(typingr   r   r   r   r   r   r9   unstructured.chunkingr	   unstructured.documents.elementsr
   r   unstructured.documents.htmlr   unstructured.documents.xmlr    unstructured.file_utils.encodingr   'unstructured.file_utils.file_conversionr    unstructured.file_utils.filetyper   r   unstructured.partition.commonr   r   r   r   unstructured.partition.langr   'unstructured_inference.inference.layoutr   HTMLr8   bytesboolrH   rM   r?    rI   rG   <module>rh      s   ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?  7 7 7 7 7 7 E E E E E E E E 4 4 4 4 4 4 4 4 4 4 4 4 : : : : : : M M M M M M                   < ; ; ; ; ; GFFFFFF HM**" $" %!  #'#('+,0%*'+&,X(-&*'{ {sm{
2e9
{ 3-{ 
#	{
 sm{ { { #s(^{ { { C={ !{  }{ %SM{ #{   }!{" S	"#{$ "&%{& sm'{( ){* 
']+{ { {  +* {@ # $ %'+,0&,X(-&*; ;;sm; 2e9
; 	;
  }; %SM; S	"; "&; sm; 
']; ; ; ;|     rI   