
    jW              6       .   U d dl Z d dlmZmZmZmZmZmZ d dlZd dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ i Z%ee&ef         e'd<    e$d          rd dl(m)Z) d dl*m+Z+ e)e%d<   e+e%d<    e$d          rd dl,m-Z- d dl.m/Z/ e-e%d<   e/e%d<    e$d          r e$d          rd dl0m1Z1 e1e%d<    e$d          rd dl2m3Z3 e3e%d<    e$d          r!d dl4m5Z5 d dl6m7Z7 d d l8m9Z9 e5e%d!<   e7e%d"<   e9e%d#<    e$d$          rd d%l:m;Z; e;e%d&<    e$d'          rd d(l<m=Z= e=e%d)<   g d*Z> e?d+ e>D                       rd d,l@mAZA eAe%d-<    e$d.          rd d/lBmCZC eCe%d0<    e$d1          rd d2lDmEZE d d3lFmGZG eEe%d4<   eGe%d1<    e$d          r e$d5          rd d6lHmIZI eIe%d7<   	 d]d8e&d9eee&ef                  fd:ZJdddddd;e jK        ddi g d<d=ddd;d;d;ddd;d;dddddfd>ee&         d?ee&         d@eeeL                  dAee&         dBee&         dCeMdDe&dEee&         dFeee&ge&f                  dGee&e&f         dHee&         dIeMdJee&         dKeee&                  dLeMdMeMdNeMdOeee&                  dPee&         dQeMdReMdSee
         dTee&         dUeeN         dVee&         dWee&         f4dXZOdi d=dfdBe&d?ee&         dGee&e&f         dIeMdUeeN         dYee jP        ee         f         fdZZQd[ee         dHee&         dMeMdYeMfd\ZRdS )^    N)IOCallableDictListOptionalTuple)DataSourceMetadata)FILETYPE_TO_MIMETYPESTR_TO_FILETYPEFileTypedetect_filetypeis_json_processable)logger)exactly_one)partition_email)partition_html)partition_json)&convert_old_ocr_languages_to_languages)partition_text)PartitionStrategy)partition_xmldependency_existsPARTITION_WITH_EXTRAS_MAPpandas)partition_csv)partition_tsvcsvtsvdocx)partition_doc)partition_docxdocpypandoc)partition_odtodt)partition_epubepub)partition_org)partition_rst)partition_rtforgrstrtfmarkdown)partition_mdmd
msg_parser)partition_msgmsg)	pdf2imagepdfminerPILc              #   4   K   | ]}t          |          V  d S Nr   ).0deps     e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/auto.py	<genexpr>r=   P   s+      55#555555    )partition_pdfpdfunstructured_inference)partition_imageimagepptx)partition_ppt)partition_pptxpptopenpyxl)partition_xlsxxlsxdoc_typepartition_with_extras_mapc           	      x    |t           }|                    |           }|t          d|  d|  d|  d          |S )N
partition_z is not available. Install the z- dependencies with pip install "unstructured[z]")r   getImportError)rK   rL   _partition_funcs      r<   _get_partition_with_extrasrR   j   ss     !($=!/33H==O6 6 6#6 6)16 6 6
 
 	

 r>   F)r@   jpgpngxlsrJ   Tfilenamecontent_typefilefile_filenameurlinclude_page_breaksstrategyencodingparagraph_grouperheadersskip_infer_table_types
ssl_verifyocr_languages	languagesdetect_language_per_elementpdf_infer_table_structurepdf_extract_imagespdf_extract_element_typespdf_image_output_dir_pathpdf_extract_to_payloadxml_keep_tagsdata_source_metadatametadata_filenamerequest_timeouthi_res_model_name
model_namec                 $   t          || |           |r|rt          d          ||}t          j        d           |                    d|           |dk    rd}|4|t          d          t          |          }t          j        d           |t          |||	||	          \  }}n.|	i k    rt          j        d
           t          | ||||          }||	                    d           t          ||
|          }|t          j        k    rt          d          } |d,| ||||d|}n|t          j        k    rt          d          } |d,| ||||d|}n|t          j        k    rt          d          }  | d,| ||||d|}n|t          j        k    rt#          d,| ||||d|}nw|t          j        k    rt          d          }! |!d,| |||d|}nI|t          j        k    rt)          d,| |||||d|}n$|t          j        k    rt-          d,| |||||d|}n|t          j        k    r t          d          }" |"d,| |||||d|}n|t          j        k    rt          d          }# |#d,| ||||d|}n|t          j        k    r t          d          }$ |$d,| |||||d|}np|t          j        k    r t          d          }% |%d,| |||||d|}n@|t          j        k    r(t          d          }& |&d,| |d|||||||||p|d|}n|t          j        k    s |t          j        k    s|t          j        k    rt?          d,| |d|||||p|d|}n|t          j         k    rtC          d,| |||||d|}n|t          j"        k    r t          d           }' |'d,| |||||d|}nj|t          j#        k    r t          d!          }( |(d,| |||||d|}n:|t          j$        k    r t          d"          }) |)d,| |||||d|}n
|t          j%        k    r0tM          | |#          st          d$          tO          d,| |d#|}n|t          j(        k    s|t          j)        k    rt          d%          }* |*d,| ||||d|}n|t          j*        k    rt          d&          }+ |+d,| ||||d|}n^|t          j+        k    rt          d'          }, |,d,| |||d|}n1|t          j,        k    rg }n| sd(nd)|  }-t          |- d*| d+          |D ]c}.||.j-        _.        ||.j-        _/        |0ta          j1        |          }/|/td          |/         nd|.j-        _3        Ltd          |         |.j-        _3        d|S )-ap  Partitions a document into its constituent elements. Will use libmagic to determine
    the file's type and route it to the appropriate partitioning function. Applies the default
    parameters for each partitioning function. Use the document-type specific partitioning
    functions if you need access to additional kwarg options.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    content_type
        A string defining the file content in MIME type
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
    url
        The url for a remote document. Pass in content_type if you want partition to treat
        the document as a specific content_type.
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    strategy
        The strategy to use for partitioning PDF/image. Uses a layout detection model if set
        to 'hi_res', otherwise partition simply extracts the text from the document
        and processes it.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    headers
        The headers to be used in conjunction with the HTTP request if URL is set.
    skip_infer_table_types
        The document types that you want to skip table extraction with.
    ssl_verify
        If the URL parameter is set, determines whether or not partition uses SSL verification
        in the HTTP request.
    languages
        The languages present in the document, for use in partitioning and/or OCR. For partitioning
        image or pdf documents with Tesseract, you'll first need to install the appropriate
        Tesseract language pack. For other partitions, language is detected using naive Bayesian
        filter via `langdetect`. Multiple languages indicates text could be in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    pdf_infer_table_structure
        If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
        additional metadata field, "text_as_html," where the value (string) is a just a
        transformation of the data into an HTML <table>.
        The "text" field for a partitioned Table Element is always present, whether True or False.
    pdf_extract_images
        Only applicable if `strategy=hi_res`.
        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
        or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_element_types' for broader extraction capabilities.
    pdf_extract_element_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
        within metadata fields.
    pdf_extract_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    pdf_image_output_dir_path
        Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
        The filesystem path for saving images of the element type(s)
        specified in 'extract_element_types'.
    xml_keep_tags
        If True, will retain the XML tags in the output. Otherwise it will simply extract
        the text from within the tags. Only applies to partition_xml.
    request_timeout
        The timeout for the HTTP request if URL is set. Defaults to None meaning no timeout and
        requests will block indefinitely.
    hi_res_model_name
        The layout detection model used when partitioning strategy is set to `hi_res`.
    model_name
        The layout detection model used when partitioning strategy is set to `hi_res`. To be
        deprecated in favor of `hi_res_model_name`.
    )rX   rV   rZ   zOnly one of metadata_filename and file_filename is specified. metadata_filename is preferred. file_filename is marked for deprecation.NzuThe file_filename kwarg will be deprecated in a future version of unstructured. Please use metadata_filename instead.rl    z}Only one of languages and ocr_languages should be specified. languages is preferred. ocr_languages is marked for deprecation.zmThe ocr_languages kwarg will be deprecated in a future version of unstructured. Please use languages instead.)rZ   rW   r_   ra   rm   zUThe headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.)rV   rX   rY   rW   r]   r   r#   )rV   rX   infer_table_structurerc   rd   r    r&   )rV   rX   r]   rc   rd   r4   )rV   rX   rc   rd   )rV   rX   r[   r]   rc   rd   )rV   rX   r]   rj   rc   rd   r(   )rV   rX   r[   rr   rc   rd   r,   )rV   rX   r[   rc   rd   r-   r1   r@   )rV   rX   rZ   r[   rr   r\   rc   extract_images_in_pdfextract_element_typesimage_output_dir_pathextract_to_payloadrn   )rV   rX   rZ   r[   rr   r\   rc   rn   )rV   rX   r]   r^   rc   rd   r.   rG   rD   )rV   rX   zDetected a JSON file that does not conform to the Unstructured schema. partition_json currently only processes serialized Unstructured output.rJ   r   r   zInvalid filezInvalid file z. The z) file type is not supported in partition. )4r   
ValueErrorr   warn
setdefaultr   warningfile_and_type_from_urlr   seekdecide_table_extractionr   DOCrR   DOCXODTEMLr   MSGHTMLr   XMLr   EPUBORGRSTMDPDFPNGJPGTIFFrB   TXTr   RTFPPTPPTXJSONr   r   XLSXXLSCSVTSVEMPTYmetadatarZ   data_sourcer   rO   r
   filetype)0rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   kwargsr   rr   _partition_docelements_partition_docx_partition_odt_partition_msg_partition_epub_partition_org_partition_rst_partition_md_partition_pdf_partition_rtf_partition_ppt_partition_pptx_partition_xlsx_partition_csv_partition_tsvr4   elementout_filetypes0                                                   r<   	partitionr   z   s&	   X TH#6666 
] 
W
 
 	

  )4	
 	
 	
 )+<===   S  
 ?}MMIN0  
 /%!+
 
 
hh b==N5   #+%
 
 
 		!3!  8<3E::!> 
"7(C
 
 
 
 
X]	"	"4V<<"? 
"7(C
 
 
 
 
X\	!	!3E::!> 
"7(C
 
 
 
 
X\	!	!" 
(C
 
 
 
 
X\	!	!3E::!> 
(C	
 

 
 
 
X]	"	"! 
 3(C
 
 
 
 
X\	!	!  
'(C
 
 
 
 
X]	"	"4V<<"? 
 3"7(C
 
 
 
 
X\	!	!3E::!> 
 3(C
 
 
 
 
X\	!	!3E::!> 
 3"7(C
 
 
 
 
X[	 	 2488 = 
 3"7(C
 
 
 
 
X\	!	!3E::!> 
 3"7"4";";5/=:
 
 
 
 hl
"
"HL(@(@hRZR_F_F_" 

 3"7/=:

 

 

 

 
X\	!	!! 
/(C
 
 
 
 
X\	!	!3E::!> 
 3"7(C
 
 
 
 
X\	!	!3E::!> 
 3"7(C
 
 
 
 
X]	"	"4V<<"? 
 3"7(C
 
 
 
 
X]	"	""H4@@@ 	Z   "I8$II&II
hm
#
#X\)A)A4V<<"? 
"7(C
 
 
 
 
X\	!	!3E::!> 
"7(C
 
 
 
 
X\	!	!3E::!> 
(C	
 

 
 
 
X^	#	#$,Lnn2L(2L2LCZZxZZZ[[[ 	G 	G"';$#*.|<<L6B6N$\22TX %% )=X(FG%%Or>   returnc                     t          j        | |||          }t          j        |j                  }|p|j                            d          }|j                            dd          }t          |||          }||fS )N)r_   verifytimeoutzContent-TypezContent-Encodingzutf-8)rX   rW   r]   )requestsrO   ioBytesIOcontentr_   r   )	rZ   rW   r_   ra   rm   responserX   r]   r   s	            r<   r|   r|     s     |C_]]]H:h&''DG8#3#7#7#G#GL##$6@@HD|hWWWH>r>   r   c                     | r| j                                         nd }|dk    r'||v r|rt          j        d| d| d           ||vp|S ||vS )Nr@   z3Conflict between variables skip_infer_table_types: z  and pdf_infer_table_structure: zK, please reset skip_infer_table_types to turn on table extraction for PDFs.)namelowerr   r{   )r   r`   re   rK   s       r<   r~   r~   '  s    
 )1:x}""$$$dH5---2K-N\F\ \ \2K\ \ \  
 55R9RR111r>   r9   )Sr   typingr   r   r   r   r   r   r   unstructured.documents.elementsr	    unstructured.file_utils.filetyper
   r   r   r   r   unstructured.loggerr   unstructured.partition.commonr   unstructured.partition.emailr   unstructured.partition.htmlr   unstructured.partition.jsonr   unstructured.partition.langr   unstructured.partition.textr   &unstructured.partition.utils.constantsr   unstructured.partition.xmlr   unstructured.utilsr   r   str__annotations__unstructured.partition.csvr   unstructured.partition.tsvr   unstructured.partition.docr!   unstructured.partition.docxr"   unstructured.partition.odtr%   unstructured.partition.epubr'   unstructured.partition.orgr)   unstructured.partition.rstr*   unstructured.partition.rtfr+   unstructured.partition.mdr0   unstructured.partition.msgr3   pdf_importsallunstructured.partition.pdfr?   unstructured.partition.imagerB   unstructured.partition.pptrE   unstructured.partition.pptxrF   unstructured.partition.xlsxrI   rR   AUTObytesboolintr   r   r|   r~   rw   r>   r<   <module>r      sx   					 < < < < < < < < < < < < < < < <  > > > > > >              ' & & & & & 5 5 5 5 5 5 8 8 8 8 8 8 6 6 6 6 6 6 6 6 6 6 6 6      7 6 6 6 6 6 D D D D D D 4 4 4 4 4 4 0 0 0 0 0 013 4X. 3 3 3X 5888888888888'4e$'4e$ V 7888888::::::'4e$(6f% V 5!2!2:!>!> 5888888'4e$ Z   7::::::(6f% Z   5888888888888888888'4e$'4e$'4e$ Z   3666666&2d# \"" 5888888'4e$ /..35555555 5888888'4e$ -.. 9<<<<<<)8g& V 7888888::::::'4e$(6f% X 7#4#4Z#@#@ 7::::::(6f%
 @D 'S(](;<   " #"& $#' %%*"8< (L(L(L#'%)(-&+$59/3#(9='+%)'+ $5Y YsmY3-Y 2e9
Y C=	Y
 
#Y Y Y smY  # 45Y #s(^Y !IY Y C=Y S	"Y "&Y   $!Y" #Y$  (S	2%Y&  (}'Y( !)Y* +Y, ##56-Y.  }/Y0 c]1Y2  }3Y4 5Y Y Y Y| #' %) 	3- #s(^ 	
 c] 2:x))*   "2x 2 I2  $2 
	2 2 2 2 2 2r>   