
    jW                       d dl mZ d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
mZ d dlmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZmZ d d	lmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d d
l&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z- d dl.m/Z/m0Z0  e/d          r e/d          rd dl1m2Z3  e/d          r e/d          rd dl4m2Z5  e/d          r e/d          rd dl6m7Z7 erd dl8m9Z9m:Z: d dl;m<Z< g dg ddZ=dodZ>dpd"Z?	 	 	 dqdrd.Z@ee          eeA         fdsd3ZBe=fdtd7ZC	 	 	 	 	 	 	 	 	 	 	 dudvdDZDdwdEZE	 	 dxdydJZFdzdNZGd{dRZH	 d|d}dUZI	 d~ddYZJdd[ZKdd_ZLd`d`dd#dde-dfddgZM	 	 	 dddnZNdS )    )annotationsN)datetime)BufferedReaderBytesIOTextIOWrapper)SpooledTemporaryFile)	IOTYPE_CHECKINGAnyBinaryIODictListOptionalTupleUnion)tabulate)CoordinateSystem
PixelSpace)
TYPE_TO_TEXT_ELEMENT_MAPCheckBoxCoordinatesMetadataElementElementMetadataElementTypeListItem	PageBreakTextTitle)logger)ENUMERATED_BULLETS_REUNICODE_BULLETS_RE)SORT_MODE_DONTSORT_MODE_XY_CUT)dependency_existsfirstdocxz
docx.table)Tablepptxz
pptx.tablenumpycv2)sort_page_elements)DocumentLayout
PageLayout)LayoutElement)	r   UncategorizedTextNarrativeTextr   BulletedTextr'   FigureCaptionr   r'   )
r   r   r/   r0   r   r1   r'   r2   r   r'   )r   HeaderfilenamestrreturnUnion[str, None]c                    t          j        t          j                            |                     }|                    d          S )Nz%Y-%m-%dT%H:%M:%S%z)r   fromtimestampospathgetmtimestrftime)r4   modify_dates     g/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/common.pyget_last_modified_dater@   S   s7    ()9)9()C)CDDK 5666    file>Union[IO[bytes], SpooledTemporaryFile[bytes], BinaryIO, bytes]c                ^    d }t          | d          r| j        }|sd S t          |          }|S )Nname)hasattrrE   r@   )rB   r4   r>   s      r?    get_last_modified_date_from_filerG   X   sB     HtV 9 t(22KrA   Thtmllayout_element/Union['LayoutElement', Element, Dict[str, Any]]coordinate_systemOptional[CoordinateSystem]infer_list_itemsboolsource_formatOptional[str]Union[Element, List[Element]]c                l   t          | t                    r|dk    r| S t          | t                    rt          d          S t          | t                    s|                                 }n| }|                    d          }|                    d          }|                    d          }|                    d          }d|v r|d         nd	}	d	}
|	r|	j        }
|rKt          |t          t          t          t          j        f          rt          t          |          
          }nt                      }|t          j        k    r.|rt          |||||
          S t!          |r|nd||||
          S |t"          v rXt"          |         } ||||||
          }|t          j        k    rd|j        _        n|t          j        k    rd|j        _        |S |t          j        k    rt/          d||||
          S |t          j        k    rt/          d||||
          S t3          |r|nd||||
          S )zSConverts an unstructured_inference LayoutElement object to an unstructured Element.rH    textrU   coordinatestypeprobsourceN)detection_class_prob)rV   rK   metadatadetection_originrU   rV   rK   r[   r\         T)checkedrV   rK   r[   r\   F)
isinstancer   r   dictto_dictgetvalueintr5   floatnumbersNumberr   r   LISTlayout_list_to_list_itemsr   r   HEADLINEr[   category_depthSUB_HEADLINECHECKEDr   	UNCHECKEDr   )rI   rK   rM   rO   layout_dictrU   rV   element_typerX   
aux_originoriginclass_prob_metadata_element_classs                r?   normalize_layout_elementrw   f   s    .'** }/F/F .),, "b!!!!nd++ %$,,..$??6""D //-00K??6**L??6""D*2k*A*AX&&tJF "! 0
4#sE7>!BCC 0-5;;OOO-//{''' 	,'"3,!'    !)TTr'"3,!'    
1	1	11,?'#/(#
 
 
 ;///56N#22[55556N#2	,	,	,#/(#
 
 
 	
 
.	.	.#/(#
 
 
 	
 %2#/(#
 
 
 	
rA   rU   rV   )Optional[Tuple[Tuple[float, float], ...]]List[Element]c                Z   | rt          j        |           ng }t          |          dk    r| rt          j        |           ng }g }|D ]b}t          |                                          dk    r;t          |                                ||||          }|                    |           c|S )z=Converts a list LayoutElement to a list of ListItem elements.r^   r   r]   )r    splitlenr!   stripr   append)	rU   rV   rK   r[   r\   split_items
list_itemstext_segmentitems	            r?   rk   rk      s     8<C'-d333K
;18<D(.t444" "J# $ $|!!##$$q(( !''))'"3!!1  D d###rA   elementsrulesetDict[str, List[str]]c                   g }| D ]}|j         j        d}t          |dd          }t          |j         dd          pd}|s=|rx|d         }t          |d          }t          |j         dd          pd}	||k    r|	|k     s||k    r ||                    |g           v r|j        }n|                                 |x||j         _        |                    |           | S )zzSets the parent_id for each element in the list of elements
    based on the element's category, depth and a ruleset

    Ncategoryrm   r   )r[   	parent_idgetattrrd   idpopr~   )
r   r   stackelementr   element_categoryelement_category_depthtop_elementtop_element_categorytop_element_category_depths
             r?   set_element_hierarchyr      s6    E # #%1	"7J==!()9;KQ!O!O!TST 	 	#(9K#*;
#C#C ($  
  ' %(888.1GGG$(888$4H"(M(MMM'N	IIKKK-  	0 &/"WOrA   r   r   filetypepage_numberOptional[int]urltext_as_htmlsection
image_pathr\   	languagesOptional[List[str]]c                6   ||t          ||          nd}t          | d          rt          | j                  dk    r| j        nd}|rd |D             nd}|rd |D             nd}t          | d          rt          | j                  dk    r| j        nd}|rd |D             nd}|rd	 |D             nd}| j        j        r| j        j        nd}t          |||||||||||||	|
          }| j                            |           |
|
| j        _	        | S )zAdds document metadata to the document element. Document metadata includes information
    like the filename, source url, and page number.N)pointssystemlinksr   c                8    g | ]}|                     d           S )r   rd   .0links     r?   
<listcomp>z)_add_element_metadata.<locals>.<listcomp>0  s"    333T%333rA   c                8    g | ]}|                     d           S rT   r   r   s     r?   r   z)_add_element_metadata.<locals>.<listcomp>1  s$    555t$((6""555rA   emphasized_textsc                8    g | ]}|                     d           S rT   r   r   emphasized_texts     r?   r   z)_add_element_metadata.<locals>.<listcomp>8  s&    MMM		V	$	$MMMrA   c                8    g | ]}|                     d           S )tagr   r   s     r?   r   z)_add_element_metadata.<locals>.<listcomp>=  s&    LLL		U	#	#LLLrA   )rV   r4   r   r   r   r   	link_urls
link_textsemphasized_text_contentsemphasized_text_tagsr   rm   r   r   )
r   rF   r|   r   r   r[   rm   r   updater\   )r   r4   r   r   r   r   rV   rK   r   r   r\   r   kwargscoordinates_metadatar   r   r   r   r   r   depthr[   s                         r?   _add_element_metadatar     s   . "'8'D	 	$	
 	
 	
 	

   %Wg66[3w};M;MPQ;Q;QGMMW[E7<F33U3333$I9>H55u5555DJ 7.//	478P4Q4QTU4U4U 	    	MM<LMMMM  	LL;KLLLL 
 07/?/NXG++TXE(!!91  H  H%%%#,<)NrA   c                    g }t                      }| D ]d}t          |          }t          |t                    r"|D ]	}||_        
|                    |           H||_        |                    |           e|S )zRemoves document metadata from the document element. Document metadata includes information
    like the filename, source url, and page number.)r   rw   ra   listr[   extendr~   )layout_elementsr   r[   rI   r   _elements         r?   _remove_element_metadatar   Y  s     !H  H) % %*>::gt$$ 	%# - -$,!!OOG$$$$'GOOG$$$$OrA   input_filenameoutput_directorytarget_formattarget_filterc                   || d| }ddd|d|| g}	 t          j        |t           j        t           j                  }|                                \  }}n# t          $ r t	          d          w xY wt          j        |                                                                           |r:t          j	        |                                                                           dS dS )	a  Converts a .doc file to a .docx file using the libreoffice CLI.

    Parameters
    ----------
    input_filename: str
        The name of the .doc file to convert to .docx
    output_directory: str
        The output directory for the convert .docx file
    target_format: str
        The desired output format
    target_filter: str
        The output filter name to use when converting. See references below
        for details.

    References
    ----------
    https://stackoverflow.com/questions/52277264/convert-doc-to-docx-using-soffice-not-working
    https://git.libreoffice.org/core/+/refs/heads/master/filter/source/config/fragments/filters

    N:sofficez
--headlessz--convert-toz--outdir)stdoutstderra  soffice command was not found. Please install libreoffice
on your system and try again.

- Install instructions: https://www.libreoffice.org/get-help/install-howto/
- Mac: https://formulae.brew.sh/cask/libreoffice
- Debian: https://wiki.debian.org/LibreOffice)

subprocessPopenPIPEcommunicateFileNotFoundErrorr   infodecoder}   error)r   r   r   r   commandprocessoutputr   s           r?   convert_office_docr   m  s   4  (::=:: 	G
"??
 
 

  ++-- 
 
 
1
 
 	

 K%%''((( -U\\^^))++,,,,,- -s   AA A1r   r   Nonec                 V   t          d |                                 D                       dk    ryt          |                                           }t	          |          dk    r+dd                    |dd                    d|d          d}n|d	          d}t          |          dS )
z
    Verify arguments; exactly one of all keyword arguments must not be None.

    Example:
        >>> exactly_one(filename=filename, file=file, text=text, url=url)
    c                "    g | ]}|d uo|dk    S )NrS    )r   args     r?   r   zexactly_one.<locals>.<listcomp>  s%    GGGS_*GGGrA   r^   zExactly one of z, Nr   z and z must be specified.r   )sumvaluesr   keysr|   join
ValueError)r   namesmessages      r?   exactly_oner     s     GGv}}GGGHHAMMV[[]]##u::>>b		%*(=(=bbE"IbbbGGq666G!!! NMrA   file_obj=Optional[Union[bytes, BinaryIO, SpooledTemporaryFile[bytes]]] Optional[Union[bytes, BinaryIO]]c                    t          | t                    r8|                     d           |                                 }t	          |          S | S )Nr   )ra   r   seekreadr   )r   contentss     r?   spooled_to_bytes_io_if_neededr     sK     (011 a==??x    rA   7Optional[Union[bytes, SpooledTemporaryFile, IO[bytes]]]bytesc                   t          | t                    r| }nt          | t                    r?|                     d           |                                 }|                     d           nt          | t
                    r|                                 }nmt          | t          t          f          rBt          | j
        d          5 }|                                }d d d            n# 1 swxY w Y   nt          d          |S )Nr   rbzInvalid file-like object type)ra   r   r   r   r   r   getvaluer   r   openrE   r   )rB   f_bytesfs      r?   convert_to_bytesr     s    $ :	D.	/	/ 
:		!))++		!	D'	"	" :--//	D=.9	:	: :$)T"" 	affhhG	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 8999Ns   C))C-0C-tableUnion['docxtable', 'pptxtable']as_htmlc                    |rdnd}t          | j                  }t          |          dk    r>d |d         j        D             }d |dd         D             }t	          |||          }nd	}|S )
al  
    Convert a table object from a Word document to an HTML table string using the tabulate library.

    Args:
        table (Table): A docx.table.Table object.
        as_html (bool): Whether to return the table as an HTML string (True) or a
            plain text string (False)

    Returns:
        str: An table string representation of the input table.
    rH   plainr   c                    g | ]	}|j         
S r   rT   r   cells     r?   r   z3convert_ms_office_table_to_text.<locals>.<listcomp>  s    77749777rA   c                0    g | ]}d  |j         D             S )c                    g | ]	}|j         
S r   rT   r   s     r?   r   z>convert_ms_office_table_to_text.<locals>.<listcomp>.<listcomp>  s    111t111rA   )cells)r   rows     r?   r   z3convert_ms_office_table_to_text.<locals>.<listcomp>  s)    FFFc11sy111FFFrA   r^   N)headerstablefmtrS   )r   rowsr|   r   r   )r   r   fmtr   r   data
table_texts          r?   convert_ms_office_table_to_textr    s     
(&&C
D
4yy1}}77a777FFT!""XFFFdGcBBB


rA   sc                D    t          t          j        |                     S )z
    Check if the input string contains any emoji characters.

    Parameters:
    - s (str): The input string to check.

    Returns:
    - bool: True if the string contains any emoji, False otherwise.
    )rN   emojiemoji_count)r  s    r?   contains_emojir    s     !!$$%%%rA   pager-   rb   c                   t          | dd          }t          | dd          }|r|j        }|j        }|j        }nH|r@|                    d          }|                    d          }|                    d          }nd}d}d}|||dS )z:Retrieve image metadata and coordinate system from a page.imageNimage_metadataformatwidthheight)r  r  r  )r   r  r  r  rd   )r	  r  r  image_formatimage_widthimage_heights         r?   _get_page_image_metadatar    s    
 D'4((ET#3T::N |k|	 %))(33$((11%))(33   rA   Fdocument'DocumentLayout'sortableinclude_page_breakslast_modification_date	sort_modec	                8   g }
t          | j                  }t          | j                  D ]\  }}g }t          |          }|                    d          }|                    d          }|                    d          }g }|j        D ]ފ|r)|r't          j        d          rt          ||          }nd}t          |||r|nd          }t          |t                    rY|D ]}|r||j        _        |d	z   |j        _         |                    |           |                    fd
|D                        |r||j        _        t          d          rj        nd|j        _        	 t          |t"                    r6|j        j        *t'          d |j        D                       rd|j        _        n$# t(          $ r t+          j        d           Y nw xY w|                    |           |                    |f           |j        j        r|j        j        j        nd}t          d          rj        nd}t7          |f|d	z   ||||j        j        |||d|	 |D ]H\  }t          d          r3j        ,t;          fd|D                       }|j        |j        _        I|}|r|t@          k    rtC          ||          }|r,||d	z
  k     r#|                    tE          d                     |
                    |           |
S )zDConverts a DocumentLayout object to a list of unstructured elements.r  r  r  rV   r  r  NrH   rK   rM   rO   r^   c                    g | ]}|fS r   r   )r   elrI   s     r?   r   z,document_to_element_list.<locals>.<listcomp>K  s    +S+S+SR^R,@+S+S+SrA   r   c              3  (   K   | ]}|j         d v V  dS ))HeadlineSubheadlineN)rW   )r   r  s     r?   	<genexpr>z+document_to_element_list.<locals>.<genexpr>V  s*      ]]bg)DD]]]]]]rA   r   z+HTML element instance has no attribute typer   )r   r   rV   rK   rm   r   r\   r   parentc              3  4   K   | ]\  }}|j         u |V  d S N)r#  )r   l_elr  rI   s      r?   r"  z+document_to_element_list.<locals>.<genexpr>u  s4      ]]HD"t~G\?\?\R?\?\?\?\]]rA   rS   rT   )#r|   pages	enumerater  rd   r   rF   bboxr   rw   ra   r   r[   last_modifiedr   r   r   r   rm   anyAttributeErrorr   r   r~   rV   r   r   r   r#  r%   r   r   r"   r+   r   )r  r  r  r  rM   rO   r\   r  r   r   r   	num_pagesir	  page_elementspage_image_metadatar  r  r  translation_mappingrK   r   r  rV   el_image_pathelement_parentsorted_page_elementsrI   s                              @r?   document_to_element_listr5     s    !HHN##IX^,, N. N.4')6t<<*..x88)--g66*..x88EG"m 7	 7	N )| )8K]0[0[ )$.[$V$V$V!!$(!."3!1/<Hmm&	  G '4(( F! 4 4B- K4J1./!eBK++$$W---#**+S+S+S+S7+S+S+STTT) L5KG$23:>>3Z3ZdN//`d  -O"7E22<7>7G7V7^]]t}]]]]] 8_:;(7% O O OK MNNNNNO $$W---#**NG+DEEE7>7G7S] ,33Y] 
 .5^\-R-R\))X\  "E%'"3&/>(!1#      (; 	? 	?#NG~x00 ?^5J5V!&]]]](;]]]" " .<-> *, 	P	^33#5mY#O#O  	<1y1}#4#4 ''	r(:(:(:;;;,----Os   2AF>>GGocr_dataList['LayoutElement']
image_size+Tuple[Union[int, float], Union[int, float]]common_metadataOptional[ElementMetadata]c                    |\  }}t          ||          }g }| D ]J}	t          |	|||r|nd          }
|r|
j                            |           |                    |
           K|S )zNConvert OCR layout data into `unstructured` elements with associated metadata.r  rH   r  )r   rw   r[   r   r~   )r6  r8  r:  rM   rO   r  r  rK   r   rI   r   s              r?   ocr_data_to_elementsr=    s     !+K"\JJJH" ! !*/-+8D--f	
 
 
  	5##O444    OrA   )r4   r5   r6   r7   )rB   rC   r6   r7   )NTrH   )
rI   rJ   rK   rL   rM   rN   rO   rP   r6   rQ   )rU   rP   rV   rx   rK   rL   r6   ry   )r   ry   r   r   r6   ry   )NNNNNNNNNNN)r   r   r4   rP   r   rP   r   r   r   rP   r   rP   rV   rx   rK   rL   r   rP   r   rP   r\   rP   r   r   r6   r   )r6   ry   )r&   N)r   r5   r   r5   r   r5   r   rP   )r   r   r6   r   )r   r   r6   r   r%  )rB   r   r6   r   )T)r   r   r   rN   r6   r5   )r  r5   r6   rN   )r	  r-   r6   rb   )r  r  r  rN   r  rN   r  rP   rM   rN   rO   rP   r\   rP   r  r5   r   r   r6   ry   )NTN)r6  r7  r8  r9  r:  r;  rM   rN   rO   rP   r6   ry   )O
__future__r   rh   r:   r   r   ior   r   r   tempfiler   typingr	   r
   r   r   r   r   r   r   r   r  r   "unstructured.documents.coordinatesr   r   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   r   unstructured.loggerr   unstructured.nlp.patternsr    r!   &unstructured.partition.utils.constantsr"   r#   unstructured.utilsr$   r%   
docx.tabler'   	docxtable
pptx.table	pptxtable$unstructured.partition.utils.sortingr+   'unstructured_inference.inference.layoutr,   r-   .unstructured_inference.inference.layoutelementr.   HIERARCHY_RULE_SETr@   rG   rw   r5   rk   r   r   r   r   r   r   r   r  r  r  r5  r=  r   rA   r?   <module>rP     s   " " " " " "  				           5 5 5 5 5 5 5 5 5 5 ) ) ) ) ) )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
        K K K K K K K K                        ' & & & & & O O O O O O O O S S S S S S S S 7 7 7 7 7 7 7 7V .!2!2<!@!@ .------V .!2!2<!@!@ .------W H"3"3E":": HGGGGGG MRRRRRRRRLLLLLL
 
 
    67 7 7 7
   ( 59!#)]
 ]
 ]
 ]
 ]
H o&c]    @ %7. . . . .f #"!%"&=A48! $&*%)A A A A AH   .  #'	<- <- <- <- <-~" " " " 	 	 	 	 EI    ,     4& & & &   <  %,0!#'&*%%)` ` ` ` `L 26!#'      rA   