
    j:                     @   U d dl mZ d dlmZmZmZmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZm Z m!Z! d dl"m#Z#m$Z$ d d	l%m&Z&m'Z'm(Z(m)Z) d d
l*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0 dZ1e2e3d<    e!             e$e#j4                   e            ddddddgddddf
dee2         dee
ee5         ef                  dee2         de6de6deee2                  de6dee2         de6de6dee         fd                                    Z7	 d2dej8        de6fdZ9d eee2ef                  deee2ef                  fd!Z:d"eeeef                  de	e
e;e<f         e
e;e<f         e
e;e<f         e
e;e<f         f         fd#Z=d$ej8        d%e	e;e;f         dej8        fd&Z>d'ee;         d(e	e;e;f         de	ee;         ee;         f         fd)Z?de	ee;         ee2         f         fd*Z@d+e2defd,ZA	 	 	 	 	 d3de6d.ee2         d/ee;         dee2         d0e
e2df         defd1ZBdS )4    )SpooledTemporaryFile)	IOAnyBinaryIODictListOptionalTupleUnioncastN)
fromstring)add_chunking_strategy)clean_bullets)ElementElementMetadataListItemNarrativeTextTableTextTitleprocess_metadata)FileTypeadd_metadata_with_filetype)exactly_oneget_last_modified_date get_last_modified_date_from_filespooled_to_bytes_io_if_needed)apply_lang_metadata)is_bulleted_textis_possible_narrative_textis_possible_numbered_listis_possible_titlexlsxDETECTION_ORIGINTautoFfilenamefilemetadata_filenameinclude_metadatainfer_table_structure	languagesdetect_language_per_elementmetadata_last_modifiedinclude_headerfind_subtablereturnc
           	      <   t          | |           d}|rdnd}| r't          j        | d|          }t          |           }n\|rZt	          t          t          t          t          f         |                    }t          j        |d|          }t          |          }g }d}|
                                D ]\  }}|dz  }|	s|r|                    d|d          nd}t          |                                          }|r%t          ||||p| |p|	          }t          |_        nt                      }t#          ||
          }|                    |           t'          |          }|D ]\  }}|\  }}}}|j        ||dz   ||dz   f         }t+          |          \  }} t-          ||j                  \  }!}"t1          ||||p| |p|          }|!8|"6t3          |!|z
            }#t3          ||"z
            }$t5          ||#|$f          }|!H| d|!dz            D ]:}%t7          t9          |%                    }&||&_        |                    |&           ;|[t=          |          dk    rHt7          t9          |j        d         j        d                             }&|                    |&           nw|u|                    d|d          }t          |                                          }t#          |          }||_        |r|nd|j        _         |                    |           |!J|"H| |!dz   d         D ]:}%t7          t9          |%                    }&||&_        |                    |&           ;tC          tE          |||                    }|S )a  Partitions Microsoft Excel Documents in .xlsx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_metadata
        Determines whether or not metadata is included in the output.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        User defined value for metadata.languages if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    metadata_last_modified
        The day of the last modification
    include_header
        Determines whether or not header info is included in text and medatada.text_as_html
    )r&   r'   Nr   )
sheet_nameheader   F )indexr3   na_rep)text_as_html	page_namepage_numberr&   last_modified)textmetadatar<   )elementsr+   r,   )#r   pd
read_excelr   r   r   r   r   r   r   itemsto_htmlsoupparser_fromstringtext_contentr   r$   detection_originr   append_get_connected_componentsiloc_single_non_empty_rows(_find_first_and_last_non_consecutive_rowshape_get_metadataint_get_sub_subtable_check_content_element_typestrr=   lenvaluesr8   listr   )'r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   kwargslast_modification_dater3   sheetsfr?   r:   r2   sheet	html_textr<   r=   table_connected_components_connected_component_min_max_coordsmin_xmin_ymax_xmax_ysubtablesingle_non_empty_rowssingle_non_empty_row_contentsfront_non_consecutivelast_non_consecutive	first_rowlast_rowcontentelements'                                          e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/xlsx.pypartition_xlsxrm   '   se   X ----! *QQdF 	HxDHHH!7!A!A	 H)x!556==
 
 qT&AAA!A$!G!G HK#\\^^ P1 P1
Eq N	1 )E.LLL 
 )33@@BBD 
-*!*( +.:("8"R<R   -=))*,,th777EOOE""""$=e$D$D!9N 61 615$o-<*ueU :eeai&79J&JKG]H HD%'D =)N )( )$%1*D.D  )49M9Y #$9E$A B BI"5+?#?@@H0Ix;PQQH(4#@A\CX[\C\A\#] 1 1"=c'll"K"K+3( 0000'CMMQ,>,>9#hmA>N>UVW>X:Y:YZZGOOG,,,,) ( 0 0u^\^ 0 _ _I0;;HHJJD$$///H(0H%BW5aYY]aH%2OOH---(49M9Y#@-133$ 1 1 #>c'll"K"K+3( 0000m61p (C	
 	
 	
 H O    rY   filterc           	         | j         \  }}t          j        ||          }t          j        ||f          j        }|                                 j        }d ||         D             }|                    |           t          j        |          }g }	|D ]@}
t          |
          }t          |          \  }}}}|	                    |||||d           A|rt          |	          }	d |	D             S )a  
    Identify connected components of non-empty cells in an excel sheet.

    Args:
        sheet: an excel sheet read in DataFrame.
        filter (bool, optional): If True (default), filters out overlapping components
        to return distinct components.

    Returns:
        A list of tuples, each containing:
            - A list of tuples representing the connected component's cell coordinates.
            - A tuple with the min and max x and y coordinates bounding the connected component.

    Note:
        This function performs a depth-first search (DFS) to identify connected components of
        non-empty cells in the sheet. If 'filter' is set to True, it also filters out
        overlapping components to return distinct components.
    c                 ,    g | ]}t          |          S  )tuple).0pairs     rl   
<listcomp>z-_get_connected_components.<locals>.<listcomp>   s    GGGtuT{{GGGrn   	componentr_   r`   ra   rb   c                 Z    g | ](}|d          |d         |d         |d         |d         ff)S rw   rr   )rt   connected_components     rl   rv   z-_get_connected_components.<locals>.<listcomp>   sY           ,#G,#G,#G,#G,		
  rn   )rL   nxgrid_2d_graphnpindicesTisnaremove_nodes_fromconnected_componentsrT   _find_min_max_coordrG   _filter_overlapping_tables)rY   ro   max_rowmax_colgraph
node_arrayempty_cellsnodes_to_removeconnected_components_as_nodesr   
_componentrx   r_   r`   ra   rb   s                   rl   rH   rH      s0   , {GW&w88EWg.//1J**,,.KGGz+/FGGGO	O,,,$&$;E$B$B!3 
 

$$	%8%C%C"ueU##& 	
 	
 	
 	
  P9:NOO  $8   rn   r   c                    t          | d           }g }d}|D ]}||}|d         |d         k    r|d                             |d                    t          |d         |d                   |d<   t          |d         |d                   |d<   t          |d         |d                   |d<   t          |d         |d                   |d<   |                    |           |}||                    |           |S )	zT
    Filter out overlapping connected components to return distinct components.
    c                     | d         S )Nr_   rr   )xs    rl   <lambda>z,_filter_overlapping_tables.<locals>.<lambda>  s
    1W: rn   )keyNr_   ra   rx   r`   rb   )sortedextendminmaxrG   )r   sorted_componentsmerged_componentscurrent_componentrx   s        rl   r   r      sD    39M9MNNN$&& . .	$ ) !%6w%???!+.55i6LMMM-01B71KYW^M_-`-`!'*-01B71KYW^M_-`-`!'*-01B71KYW^M_-`-`!'*-01B71KYW^M_-`-`!'** "(():;;;$-!!$  !2333rn   rz   c                     t          d          t          d          t          d          t          d          f\  }}}}| D ]%\  }}||k     r|}||k     r|}||k    r|}||k    r|}&||||fS )z[
    Find the minimum and maximum coordinates (bounding box) of a connected component.
    infz-inf)float)rz   r_   r`   ra   rb   _x_ys          rl   r   r     s     "'uuU||U6]]ERXMM!YE5%%  B::E::E::E::E%%%rn   rc   first_and_last_rowc                 @    |\  }}||k    rdS | j         ||dz            S )z]
    Extract a sub-subtable from a given subtable based on the first and last row range.
    Nr4   )rI   )rc   r   rh   ri   s       rl   rO   rO   /  s4     -Ix9t=X\122rn   row_indicestable_shapec                    |\  }}t          |           dk    st          |           |k    r|dk    r| d         | d         fS t          j        |           }t          d t	          t          ||dd                             D             d          }|ddd         }t          d t	          t          ||dd                             D             d          }||fS )z_
    Find the indices of the first and last non-consecutive rows in a list of row indices.
    r4   r   c              3   8   K   | ]\  }\  }}|d z   |k    |V  dS r4   Nrr   rt   ir   ys       rl   	<genexpr>z;_find_first_and_last_non_consecutive_row.<locals>.<genexpr>J  s2      GGyq&1aAEQJJJJJJGGrn   Nc              3   8   K   | ]\  }\  }}|d z
  |k    |V  dS r   rr   r   s       rl   r   z;_find_first_and_last_non_consecutive_row.<locals>.<genexpr>O  s4      YYyq&1aaRSeWXjjjjjjYYrn   )rR   r}   arraynext	enumeratezip)r   r   
table_rows
table_colsarrrf   reversed_arrrg   s           rl   rK   rK   <  s     )J

;1[!1!1Z!?!?JRSOO1~{1~--
(;

C GGIc#s122w&7&788GGG  ttt9LYYIc,QRR8H&I&IJJYYY  !"666rn   c                    g }g }|                                  D ]d\  }}|                                dk    rG|                    |           |                    |                                j        d                    e||fS )zb
    Identify single non-empty rows in a subtable and extract their row indices and contents.
    r4   r   )iterrowscountrG   dropnarI   )rc   rd   re   r6   rows        rl   rJ   rJ   U  s     $&!'')) G G
s99;;!!((///)001B11EFFF "???rn   r<   c                 4   t          |           rt          t          |                     S t          |           rt          |           S t	          |           rt          |           S t          |           rt          |           S t          |           S )zA
    Classify the type of content element based on its text.
    r>   )	r   r   r   r!   r    r   r"   r   r   r>   s    rl   rP   rP   b  s      
t$$
 
 
 	
 
#4	(	( 

 
 
 	
 
$D	)	) 

 
 
 	
 
4	 	  

 
 
 	
 
 
 
 	
rn   r   r2   r:   rV   c                 N    | rt          ||||          }nt                      }|S )z5Returns metadata depending on `include_metadata` flag)r9   r:   r&   r;   )r   )r)   r2   r:   r&   rV   r=   s         rl   rM   rM   |  s@      %" #0	
 
 
 #$$Orn   )T)TNr   NN)Ctempfiler   typingr   r   r   r   r   r	   r
   r   r   networkxr{   numpyr}   pandasr@   lxml.html.soupparserr   rD   unstructured.chunkingr   unstructured.cleaners.corer   unstructured.documents.elementsr   r   r   r   r   r   r   r    unstructured.file_utils.filetyper   r   unstructured.partition.commonr   r   r   r   unstructured.partition.langr    unstructured.partition.text_typer   r    r!   r"   r$   rQ   __annotations__XLSXbytesboolrm   	DataFramerH   r   rN   r   r   rO   rK   rJ   rP   rM   rr   rn   rl   <module>r      s"   ) ) ) ) ) ) ) N N N N N N N N N N N N N N N N N N N N N N             D D D D D D 7 7 7 7 7 7 4 4 4 4 4 4	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 R Q Q Q Q Q Q Q            < ; ; ; ; ;             #    HM**"=A'+!"&&,X(-,0 T TsmT
5E$889
:T  }T 	T
  T S	"T "&T %SMT T T 
']T T T  +* Tr 8 8<88 8 8 8vtCH~.	$sCx.   @&d38n-&
5eeCJ/sEz1BE#u*DUUV& & & &&
3 
3%S/ 
3VXVb 
3 
3 
3 
37c7sCx7 8C=(3-'(7 7 7 72
@d3ic.B(C 
@ 
@ 
@ 
@
c 
g 
 
 
 
6 " $!#"/3  # sm	
 "#t),      rn   