
    j?                    ,   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ d d
l m!Z" d dl m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m!Z!m?Z?m@Z@mAZA d dlBmCZCmDZD d dlEmFZFmGZGmHZH d dlImJZJ d dlKmLZLmMZMmNZNmOZOmPZP d dlQmRZRmSZSmTZT  eRd          rd dlUZUdZVdeWd<   eeef         ZXdeWd<   ee*e"f         ZYdeWd <    eTd          ddd!d!ddd"gd#fd;d3            ZZ eA             eDeCj[                   e1            dddd!d!d!ddd"gd#f
d<d8                                    Z\ G d9 d:          Z]dS )=    )annotationsN)SpooledTemporaryFile)
IOAnyDictIteratorListOptionalTupleTypeUnioncast)Document)WD_SECTION_START)CT_Tbl)CT_P)Section_Footer_Header)Table)_Cell_Row)	Hyperlink)RenderedPageBreak)	Paragraph)Run)tabulate)	TypeAlias)add_chunking_strategy)clean_bullets)AddressElementElementMetadataEmailAddressFooterHeaderLinkListItemNarrativeText	PageBreakr   TextTitleprocess_metadata)FileTypeadd_metadata_with_filetype)exactly_oneget_last_modified_date get_last_modified_date_from_file)apply_lang_metadata)is_bulleted_textis_email_addressis_possible_narrative_textis_possible_titleis_us_city_state_zip)dependency_existslazypropertyrequires_dependenciespypandocdocxstrDETECTION_ORIGINr   BlockElement	BlockItemTautoFsource_formatfilenameOptional[str]fileOptional[IO[bytes]]include_metadataboolinfer_table_structuremetadata_filenamemetadata_last_modified	languagesOptional[List[str]]detect_language_per_elementreturnList[Element]c	           
        t          ||           dd}	dd}
dd
}|r |	|          n( |
t          t          t                   |                    }t	          j                    5 }t          j                            | ||                    }t          j
        |d| |           t          |||||||          }ddd           n# 1 swxY w Y   |S )a  Converts a document to DOCX and then partitions it using partition_docx.

    Works with any file format support by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, .e.g. odt
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_metadata
        Determines whether or not metadata is included in the metadata attribute on the elements in
        the output.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    rD   rF   rD   r>   rP   c                j    t           j                            |           st          d|  d          | S )z;Return path to a file confirmed to exist on the filesystem.z	The file z does not exist.)ospathexists
ValueError)rD   s    e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/docx.pyvalidate_filenamez5convert_and_partition_docx.<locals>.validate_filename}   s8    w~~h'' 	ECCCCDDD    rF   	IO[bytes]c                    t          j        d          5 }|                    |                                            |j        cddd           S # 1 swxY w Y   dS )z6Return path to temporary copy of file to be converted.F)deleteN)tempfileNamedTemporaryFilewritereadname)rF   tmps     rY   copy_to_tempfilez4convert_and_partition_docx.<locals>.copy_to_tempfile   s    (666 	#IIdiikk"""8	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   .AAA	file_pathc                    t           j                            |           }t           j                            |          \  }}| dS )zAReturn a filename like "foo.docx" from a path like "a/b/foo.odt" z.docx)rU   rV   basenamesplitext)rf   rD   	root_name_s       rY   extract_docx_filenamez9convert_and_partition_docx.<locals>.extract_docx_filename   sA     7##I..w''11	1""""r[   r=   )format
outputfile)rD   rK   rH   rJ   rL   rM   rO   N)rD   r>   rP   r>   )rF   r\   rP   r>   )rf   r>   rP   r>   )r0   r   r   bytesr_   TemporaryDirectoryrU   rV   joinr<   convert_filepartition_docx)rC   rD   rF   rH   rJ   rK   rL   rM   rO   rZ   re   rl   rf   tmpdir	docx_pathelementss                   rY   convert_and_partition_docxrw   R   si   R ----      # # # # 08d!!(+++=M=MdSUV[S\^bNcNc=d=dI		$	&	& 
&GLL)>)>y)I)IJJ	  		
 	
 	
 	
 "/-"7#9(C
 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
$ Os   'ACCCinclude_page_breakschunking_strategykwargsr   c
                    t          | |           t                              | |||||          }t          |||	          }t	          |          S )a.  Partitions Microsoft Word Documents in .docx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the document
        to .docx before partition. We want the original source filename in the metadata.
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    rS   )rv   rM   rO   )r0   _DocxPartitioneriter_document_elementsr3   list)rD   rF   rK   rx   rH   rJ   rL   ry   rM   rO   rz   rv   s               rY   rs   rs      sl    X ----66 H #$?  H
 >>r[   c                     e Zd ZdZ	 	 	 	 	 	 dIdJdZe	 	 	 	 	 	 dIdKd            ZdLdZdLdZdMdZ	dNdOdZ
edPd            ZedQd            ZedQd             ZdRd#ZdSd%ZdTd&ZdMd'ZdUd)ZdVd-ZdWd/ZdXd2ZdYd4ZdZd5Zd[d7Zed\d8            Zed]d:            Zd^d<Zd_d>Zd`d@ZdadAZ dbdBZ!dcdDZ"dddFZ#dddGZ$dedHZ%dS )fr|   z8Provides `.partition()` for MS-Word 2007+ (.docx) files.NTrD   rE   rF   rG   rK   rx   rI   rJ   rL   rP   Nonec                h    || _         || _        || _        || _        || _        || _        d| _        d S )N   )	_filename_file_metadata_filename_include_page_breaks_infer_table_structure_metadata_last_modified_page_counter)selfrD   rF   rK   rx   rJ   rL   s          rY   __init__z_DocxPartitioner.__init__   s@     "
"3$7!&;#'=$"#r[   Iterator[Element]c                     | ||||||          }|j         r|                                n|                                S )zFPartition MS Word documents (.docx format) into its document elements.)rD   rF   rK   rx   rJ   rL   )_document_contains_sections_iter_document_elements#_iter_sectionless_document_elements)clsrD   rF   rK   rx   rJ   rL   r   s           rY   r}   z'_DocxPartitioner.iter_document_elements   s^     s/ 3"7#9
 
 
 /<D((***99;;	
r[   c              #    K   t          | j        j                  D ]\  }}|                     ||          E d{V  |                     |          E d{V  |                                D ]c}t          |t                    r|                     |          E d{V  3t          |t                    r| 
                    |          E d{V  d|                     |          E d{V  dS )zFGenerate each document-element in (docx) `document` in document order.N)	enumerate	_documentsections_iter_section_page_breaks_iter_section_headersiter_inner_content
isinstancer   _iter_paragraph_elements	DocxTable_iter_table_element_iter_section_footers)r   section_idxsection
block_items       rY   r   z(_DocxPartitioner._iter_document_elements  sF      %.dn.E$F$F 	; 	; K55k7KKKKKKKKK11':::::::::%88:: D D
 j)44 D#<<ZHHHHHHHHHH	  D  $77
CCCCCCCCC11'::::::::::	; 	;r[   c              #    K   | j                                         D ]c}t          |t                    r|                     |          E d{V  3t          |t
                    r|                     |          E d{V  ddS )zGenerate each document-element in a docx `document` that has no sections.

        A "section-less" DOCX must be iterated differently. Also it will have no headers or footers
        (because those live in a section).
        N)r   r   r   r   r   r   r   )r   r   s     rY   r   z4_DocxPartitioner._iter_sectionless_document_elements9  s       .;;== 	@ 	@J*i00 @88DDDDDDDDDDJ	22 @33J?????????	@ 	@r[   	paragraphr   c              #    K   |j         }|                                sdS |                     |          }|                     |          r>t	          |                                          }|rt          ||t                    V  dS |                     |          }|r |||t                    V  dS |                     |          }|r |||t                    V  dS t          ||t                    V  dS )a  Generate zero-or-one document element for `paragraph`.

        In Word, an empty paragraph is commonly used for inter-paragraph spacing. An empty paragraph
        does not contribute to the document-element stream and will not cause an element to be
        emitted.
        N)textmetadatadetection_origin)r   r   )
r   strip_paragraph_metadata_is_list_itemr    r(   r?   _style_based_element_type&_parse_paragraph_text_for_element_typer+   )r   r   r   r   
clean_text
TextSubClss         rY   _classify_paragraph_to_elementz/_DocxPartitioner._classify_paragraph_to_elementF  sG      ~ zz|| 	F++I66 i(( 	&t,,2244J #%%5     
 F 33I>>
 	*$L\]]]]]]F @@KK
 	*$L\]]]]]]F 4(=MNNNNNNNNr[   Ftabler   	is_nestedr>   c                n     d fddfdt          fd	|j        D             |rg nd
d          S )a  HTML string version of `table`.

        Example:

            <table>
            <tbody>
            <tr><th>item  </th><th style="text-align: right;">  qty</th></tr>
            <tr><td>spam  </td><td style="text-align: right;">   42</td></tr>
            <tr><td>eggs  </td><td style="text-align: right;">  451</td></tr>
            <tr><td>bacon </td><td style="text-align: right;">    0</td></tr>
            </tbody>
            </table>

        `is_nested` is used for recursive calls when a nested table is encountered. Certain
        behaviors are different in that case, but the caller can safely ignore that parameter and
        allow it to take its default value.
        cellr   rP   Iterator[str]c              3     K   |                                  D ]b}t          |t                    rt          j        |j                   V  4t          |t                    r                    |d          V  cd S )NT)r   )r   r   r   htmlescaper   r   _convert_table_to_htmlr   r   r   s     rY   iter_cell_block_itemszF_DocxPartitioner._convert_table_to_html.<locals>.iter_cell_block_items  s      "5577 R R
j)44 R "[99;;;;;	  R 55jD5QQQQQR Rr[   rowr   c                *    fd| j         D             S )Nc              3  T   K   | ]"}d                       |                    V  #dS )
Nrq   ).0r   r   s     rY   	<genexpr>zN_DocxPartitioner._convert_table_to_html.<locals>.iter_cells.<locals>.<genexpr>  s;      QQtDII33D99::QQQQQQr[   )cells)r   r   s    rY   
iter_cellsz;_DocxPartitioner._convert_table_to_html.<locals>.iter_cells  s    QQQQsyQQQQr[   c                @    g | ]}t           |                    S  )r~   )r   r   r   s     rY   
<listcomp>z;_DocxPartitioner._convert_table_to_html.<locals>.<listcomp>  s)    999sT**S//""999r[   firstrow
unsafehtml)headerstablefmtr   r   rP   r   )r   r   rP   r   )r   rows)r   r   r   r   r   s   `  @@rY   r   z'_DocxPartitioner._convert_table_to_htmlq  s    &		R 		R 		R 		R 		R 		R	R 	R 	R 	R 	R 	R 9999ej999#3BB "
 
 
 	
r[   r   c                   | j         | j        }}|t          j        |          S |J t	          |t
                    r;|                    d           t          j        |	                                          }t          j        |          S )z?The python-docx `Document` object loaded from file or filename.Nr   )
r   r   r=   r   r   r   seekioBytesIOrb   )r   rD   rF   s      rY   r   z_DocxPartitioner._document  s|     $=***d011 	+IIaLLL:diikk**D}T"""r[   c                ^    d}t          | j        j                            |                    S )a>  True when there is at least one page-break detected in the document.

        Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably
        inserted by Microsoft Word, but probably don't appear in documents converted into .docx
        format from for example .odt format.
        z./w:body/w:p/w:r/w:lastRenderedPageBreak | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak)rI   r   elementxpath)r   r   s     rY   _document_contains_pagebreaksz._DocxPartitioner._document_contains_pagebreaks  s0    V 	 DN*0077888r[   c                4    t          | j        j                  S )a2  True when there is at least one section in the document.

        This is always true for a document produced by Word, but may not always be the case when the
        document results from conversion or export. In particular, a Microsoft Teams chat-transcript
        export will have no sections.
        )rI   r   r   r   s    rY   r   z,_DocxPartitioner._document_contains_sections  s     DN+,,,r[   hdrftr_Header | _Footerc                `     d fd}d                     d  ||          D                       S )	am  The text enclosed in `hdrftr` as a single string.

        Each paragraph is included along with the text of each table cell. Empty text is omitted.
        Each paragraph text-item is separated by a newline ("
") although note that a paragraph
        that contains a line-break will also include a newline representing that line-break, so
        newlines do not necessarily distinguish separate paragraphs.

        The entire text of a table is included as a single string with a space separating the text
        of each cell.

        A header with no text or only whitespace returns the empty string ("").
        r   r   rP   r   c              3    K   |                                  D ]r}t          |t                    r|j                                        V  3t          |t
                    r*d                                        |                    V  sdS )zGenerate each text item in `hdrftr` stripped of leading and trailing whitespace.

            This includes paragraphs as well as table cell contents.
             N)r   r   r   r   r   r   rq   _iter_table_texts)r   r   r   s     rY   iter_hdrftr_textsz?_DocxPartitioner._header_footer_text.<locals>.iter_hdrftr_texts  s      
 %7799 G G
j)44 G$///111111	  G ((4#9#9*#E#EFFFFFG Gr[   r   c              3     K   | ]}||V  	d S Nr   r   r   s     rY   r   z7_DocxPartitioner._header_footer_text.<locals>.<genexpr>  s'      LL$tLLLLLLLr[   )r   r   rP   r   r   )r   r   r   s   `  rY   _header_footer_textz$_DocxPartitioner._header_footer_text  sV    	G 	G 	G 	G 	G 	G yyLL*;*;F*C*CLLLLLLr[   Iterator[PageBreak]c              #  l   K   | xj         dz  c_         | j        rt          dt                    V  dS dS )zGIncrement page-number by 1 and generate a PageBreak element if enabled.r    )r   N)r   r   r*   r?   r   s    rY   _increment_page_numberz'_DocxPartitioner._increment_page_number  sR      a$ 	CB1ABBBBBBBB	C 	Cr[   c                J    t          |j                  rdS d|j        j        v S )z7True when `paragraph` can be identified as a list-item.Tz	<w:numPr>)r4   r   _pxml)r   r   s     rY   r   z_DocxPartitioner._is_list_item  s)    IN++ 	4il...r[   c              #     K   dfd |          D ]M}t          |t                    r|                     |          E d{V  3|                                 E d{V  NdS )zGenerate zero-or-more document elements for `paragraph`.

        The generated elements can be both textual elements and PageBreak elements. An empty
        paragraph produces no elements.
        r   r   rP   'Iterator[Paragraph | RenderedPageBreak]c              3     K   | j         s| V  dS | j        d         }|j        }|r|V  |V  |j        }|r |          E d{V  dS dS )an  Generate Paragraph and RenderedPageBreak items from `paragraph`.

            Each generated paragraph is the portion of the paragraph on the same page. When the
            paragraph contains no page-breaks, it is iterated unchanged and iteration stops. When
            there is a page-break, in general there one paragraph "fragment" before the page break,
            the page break, and then the fragment after the page break. However many combinations
            are possible. The first item can be either a page-break or a paragraph, but the type
            always alternates throughout the sequence.
            Nr   )contains_page_breakrendered_page_breakspreceding_paragraph_fragmentfollowing_paragraph_fragment)r   
page_breakr   r   iter_paragraph_itemss       rY   r   zG_DocxPartitioner._iter_paragraph_elements.<locals>.iter_paragraph_items  s       0 "7:J ,6+R(+ 32222
 ,6+R( , N//0LMMMMMMMMMMMN Nr[   N)r   r   rP   r   )r   r   r   r   )r   r   itemr   s      @rY   r   z)_DocxPartitioner._iter_paragraph_elements  s      	N 	N 	N 	N 	N 	NB )(33 	9 	9D$	** 9>>tDDDDDDDDDD668888888888		9 	9r[   Iterator[Dict[str, str]]c              #     K   |j         D ]C}|j        r|j                                        nd}|s'|j        r|ddV  |j        r|ddV  DdS )zLGenerate e.g. {"text": "MUST", "tag": "b"} for each emphasis in `paragraph`.r   b)r   tagiN)runsr   r   bolditalic)r   r   runr   s       rY   _iter_paragraph_emphasisz)_DocxPartitioner._iter_paragraph_emphasis!  s      > 	1 	1C'*x738>>###RD x 1#C00000z 1#C00000	1 	1r[   r   r   Iterator[Footer]c              #      K   d fd} ||j         d          E d	{V  |j        r ||j        d
          E d	{V   j        j        j        r ||j        d          E d	{V  d	S d	S )aO  Generate any `Footer` elements defined for this section.

        A Word document has up to three header and footer definition pairs for each document
        section, a primary, first-page, and even-page header and footer. The first-page pair
        applies only to the first page of the section (perhaps a title page or chapter start). The
        even-page pair is used in book-bound documents where there are both recto and verso pages
        (it is applied to verso (even-numbered) pages). A page where neither more specialized
        footer applies uses the primary footer.
        footerr   header_footer_typer>   rP   r   c           	   3     K   | j         rdS                     |           }|sdS t          |t          t	          j        |d                    V  dS )z2Generate zero-or-one Footer elements for `footer`.Nr   rD   r   category_depthr   r   r   )is_linked_to_previousr   r%   r?   r#   r   )r   r   r   r   s      rY   iter_footerz;_DocxPartitioner._iter_section_footers.<locals>.iter_footer7        + ++F33D !1(!4'9#$         r[   primaryN
first_page	even_page)r   r   r   r>   rP   r   )r   "different_first_page_header_footerfirst_page_footerr   settings odd_and_even_pages_header_footereven_page_footer)r   r   r  s   `  rY   r   z&_DocxPartitioner._iter_section_footers,  s      	 	 	 	 	 	" ;w~y9999999995 	L"{7#<lKKKKKKKKK>"C 	J"{7#;[IIIIIIIIIII	J 	Jr[   Iterator[Header]c              #      K   d fd} ||j         d          E d	{V  |j        r ||j        d
          E d	{V   j        j        j        r ||j        d          E d	{V  d	S d	S )zGenerate `Header` elements for this section if it has them.

        See `._iter_section_footers()` docstring for more on docx headers and footers.
        headerr   r   r>   rP   r  c           	   3     K   | j         rdS                     |           }|sdS t          |t          t	          j        |d                    V  dS )z2Generate zero-or-one Header elements for `header`.Nr   r  r  )r  r   r&   r?   r#   r   )r  r   r   r   s      rY   maybe_iter_headerzA_DocxPartitioner._iter_section_headers.<locals>.maybe_iter_headerT  r  r[   r  Nr  r	  )r  r   r   r>   rP   r  )r  r
  first_page_headerr   r  r  even_page_header)r   r   r  s   `  rY   r   z&_DocxPartitioner._iter_section_headersN  s      	 	 	 	 	 	" %$W^Y?????????5 	R(()BLQQQQQQQQQ>"C 	P(()A;OOOOOOOOOOO	P 	Pr[   r   intc              #     K   d fd}|j         }|t          j        k    r% |            s                                 E d{V  n<|t          j        k    r,|dk    rdS  |            r                                 E d{V  dS )a  Generate zero-or-one `PageBreak` document elements for `section`.

        A docx section has a "start" type which can be "continuous" (no page-break), "nextPage",
        "evenPage", or "oddPage". For the next, even, and odd varieties, a `w:renderedPageBreak`
        element signals one page break. Here we only need to handle the case where we need to add
        another, for example to go from one odd page to another odd page and we need a total of
        two page-breaks.
        rP   rI   c                       j         dz  dk    S )N   r   )r   r   s   rY   page_is_oddz?_DocxPartitioner._iter_section_page_breaks.<locals>.page_is_oddu  s    %)Q..r[   Nr   rP   rI   )
start_typer   	EVEN_PAGEr   ODD_PAGE)r   r   r   r  r  s   `    rY   r   z*_DocxPartitioner._iter_section_page_breaksk  s      	/ 	/ 	/ 	/ 	/ 	/ '
 )333 ;== 966888888888+444a{}} 966888888888 	r[   Iterator[Table]c              #  D  K   | j         r|                     |          nd}d                    |                     |                    }|                     |          \  }}t          |t          t          || j        | j	        | j
        |pd|pd                    V  dS )zBGenerate zero-or-one Table element for a DOCX `w:tbl` XML element.Nr   )text_as_htmlrD   page_numberlast_modifiedemphasized_text_contentsemphasized_text_tags)r   r   )r   r   rq   r   _table_emphasisr   r?   r#   r   _page_number_last_modified)r   r   
html_table
text_tabler$  r%  s         rY   r   z$_DocxPartitioner._iter_table_element  s       <@;V`T00777\`
XXd44U;;<<
9=9M9Me9T9T6 "6-$'0 -"1)A)IT%9%AT  
 
 
 	
 	
 	
 	
 	
r[   c              #  |   K   |j         D ]1}|j        D ]'}|j        D ]}|                     |          E d{V  (2dS )zHGenerate e.g. {"text": "word", "tag": "b"} for each emphasis in `table`.N)r   r   
paragraphsr   )r   r   r   r   r   s        rY   _iter_table_emphasisz%_DocxPartitioner._iter_table_emphasis  s      : 	H 	HC	 H H!% H HI#<<YGGGGGGGGGGHH	H 	Hr[   r   c           	   #      K   d	 fd}|j         D ]H}|j        }|j        D ]7}|j        dk    rd  |t	          ||                    D             E d{V  8IdS )
a  Generate text of each cell in `table` stripped of leading and trailing whitespace.

        Nested tables are recursed into and their text contributes to the output in depth-first
        pre-order. Empty strings due to empty or whitespace-only cells are dropped.
        r   r   rP   r   c              3     K   |                                  D ]c}t          |t                    r|j                                        V  3t          |t
                    r                    |          E d{V  ddS )zGenerate each text item in `cell` stripped of leading and trailing whitespace.

            This includes paragraphs as well as table cell contents.
            N)r   r   r   r   r   r   r   r   s     rY   iter_cell_textsz;_DocxPartitioner._iter_table_texts.<locals>.iter_cell_texts  s      
 #5577 B B
j)44 B$///111111	  B  $55jAAAAAAAAAB Br[   continuec              3     K   | ]}||V  	d S r   r   r   s     rY   r   z5_DocxPartitioner._iter_table_texts.<locals>.<genexpr>  s(      UUTPTUDUUUUUUr[   Nr   )r   _trtc_lstvMerger   )r   r   r0  r   trtcs   `     rY   r   z"_DocxPartitioner._iter_table_texts  s      	B 	B 	B 	B 	B 	B : 	V 	VCBi V V9
**UU__U2s^^-L-LUUUUUUUUUUUV	V 	Vr[   c                    | j         r| j         S | j        | j        }}|&|                    d          rdnt	          |          S |J t          |          S )z8Last-modified date suitable for use in element metadata.Nz/tmp)r   r   r   
startswithr1   r2   )r   rf   rF   s      rY   r(  z_DocxPartitioner._last_modified  sp    
 ' 	0//.$*4	  $//77^44=ST]=^=^^ /555r[   Optional[int]c                "    | j         r| j        ndS )a4  The current page number, or None if we can't really tell.

        Page numbers are not added to element metadata if we can't find any page-breaks in the
        document (which may be a common case).

        In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual
        page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the
        target device. Explicit (hard) page-breaks are always recorded in the docx file but the
        rendered page-breaks are only added optionally.
        N)r   r   r   s    rY   r'  z_DocxPartitioner._page_number  s     &*%GQt!!TQr[   Tuple[List[str], List[str]]c                    t          j        |                     |                    \  }}d |D             d |D             fS )z@[contents, tags] pair describing emphasized text in `paragraph`.c                    g | ]
}|d          S r   r   r   es     rY   r   z8_DocxPartitioner._paragraph_emphasis.<locals>.<listcomp>  s    000q6000r[   c                    g | ]
}|d          S r   r   r@  s     rY   r   z8_DocxPartitioner._paragraph_emphasis.<locals>.<listcomp>  s    2S2S2S1U82S2S2Sr[   )	itertoolsteer   )r   r   iter_p_emphiter_p_emph_2s       rY   _paragraph_emphasisz$_DocxPartitioner._paragraph_emphasis  sM    %.]43P3PQZ3[3[%\%\"]00K0002S2S]2S2S2STTr[   'Tuple[List[str], List[str], List[Link]]c                    j         sg g g fS dfd}t           |                      }d |D             }d |D             }|||fS )z,Describes hyperlinks in `paragraph`, if any.rP   Iterator[Link]c               3  :  K   d}                                  D ]}t          |t                    r| t          |j                  z  } /t          |t
                    r<|j        }|j        }| }| t          |          z  } |si|slt          |||          V  dS )zGenerate `Link` typed-dict for each external link in `paragraph`.

            Word uses hyperlinks for internal "jumps" within the document, as well as for web and
            other external locations. Only generate the external ones.
            r   )r   urlstart_indexN)r   r   r   lenr   r   rM  r'   )offsetr   r   rM  rN  r   s        rY   iter_paragraph_linkszC_DocxPartitioner._paragraph_link_meta.<locals>.iter_paragraph_links  s       F!4466 L LdC(( Lc$)nn,FFi00 L9D(C"(Kc$ii'F
  !    ! Dc{KKKKKK'L Lr[   c                "    g | ]}|d          pdS )r   r   r   r   links     rY   r   z9_DocxPartitioner._paragraph_link_meta.<locals>.<listcomp>  s!    ;;;Td6l(b;;;r[   c                    g | ]
}|d          S )rM  r   rS  s     rY   r   z9_DocxPartitioner._paragraph_link_meta.<locals>.<listcomp>  s    333TT%[333r[   )rP   rK  )
hyperlinksr~   )r   r   rQ  links
link_texts	link_urlss    `    rY   _paragraph_link_metaz%_DocxPartitioner._paragraph_link_meta  s    # 	r2:	L 	L 	L 	L 	L 	L8 ))++,, <;U;;;
33U333	9e++r[   r#   c                   |                      |          }|                     |          \  }}|                     |          \  }}}t          ||pd|pd| j        | j        |pd|pd|pd| j        	  	        }d|_        |S )z.ElementMetadata object describing `paragraph`.N)	r  r$  r%  rD   r#  rX  rY  rW  r"  r=   )_parse_category_depth_by_stylerH  rZ  r#   r   r(  r'  r   )	r   r   r  r$  r%  rX  rY  rW  element_metadatas	            rY   r   z$_DocxPartitioner._paragraph_metadata  s    <<YGG9=9Q9QR[9\9\6 "6'+'@'@'K'K$
Iu*)%=%E!5!=,-!)T'4-4)

 

 

 -3)r[   c                    |j                             d          }|rt          |d                   S |j        r|j        j        pd}|                     |          }|dk    r|S |                                 S )z0Determine category depth from paragraph metadataz./w:pPr/w:numPr/w:ilvl/@w:valr   Normal)_elementr   r  stylerc   #_parse_category_depth_by_style_name#_parse_category_depth_by_style_ilvl)r   r   r   
style_namedepths        rY   r\  z/_DocxPartitioner._parse_category_depth_by_style,  s     "(()HII 	!uQx==   o>)/*>K8
88DD199L ;;===r[   c                    dS )Nr   r   r   s    rY   rc  z4_DocxPartitioner._parse_category_depth_by_style_ilvl>  s    qr[   rd  c                    dd}                     d          r |          S dk    rdS g d	}t          fd
|D                       r |          S dS )zParse category-depth from the style-name of `paragraph`.

        Category depth is 0-indexed and relative to the other element types in the document.
        suffixr>   rP   r  c                    |                                  d                                         r*t          |                                  d                   dz
  ndS )Nr   r   )splitisdigitr  )rh  s    rY   _extract_numberzM_DocxPartitioner._parse_category_depth_by_style_name.<locals>._extract_numberH  sF    28,,..2D2L2L2N2NU3v||~~b)**Q..TUUr[   HeadingSubtitler   )r	   List BulletList ContinueList Numberc              3  B   K   | ]}                     |          V  d S r   )r9  )r   prefixrd  s     rY   r   zG_DocxPartitioner._parse_category_depth_by_style_name.<locals>.<genexpr>T  s1      IIz$$V,,IIIIIIr[   r   )rh  r>   rP   r  )r9  any)r   rd  rm  list_prefixess    `  rY   rb  z4_DocxPartitioner._parse_category_depth_by_style_nameB  s    	V 	V 	V 	V   ++ 	/"?:...##1 POOIIII=IIIII 	/"?:... qr[   Optional[Type[Text]]c                   |j                                         }t          |          dk     rdS t          |          rt          S t          |          rt          S t          |          rt          S t          |          rt          S dS )zEAttempt to differentiate the element-type by inspecting the raw text.r  N)r   r   rO  r8   r!   r5   r$   r6   r)   r7   r,   )r   r   r   s      rY   r   z7_DocxPartitioner._parse_paragraph_text_for_element_typeZ  s    ~##%%t99q==4%% 	ND!! 	 %d++ 	!  T"" 	Ltr[   c                   i dt           dt          dt          dt          dt          dt          dt          dt          d	t          d
t          dt           dt          dt          dt          dt          dt          dt          t          t          t          t          t          t          t          t           t           t           t          t          t          d}|j        r|j        j        pd}|                    |          S )zElement-type for `paragraph` based on its paragraph-style.

        Returns `None` when the style doesn't tell us anything useful, including when it
        is the default "Normal" style.
        Captionz	Heading 1z	Heading 2z	Heading 3z	Heading 4z	Heading 5z	Heading 6z	Heading 7z	Heading 8z	Heading 9zIntense Quoter	   zList 2zList 3rp  zList Bullet 2zList Bullet 3)rq  zList Continue 2zList Continue 3rr  zList Number 2zList Number 3zList Paragraphz
Macro Textz
No SpacingQuotero  
TOCHeadingr,   r_  )r+   r,   r(   ra  rc   get)r   r   STYLE_TO_ELEMENT_MAPPINGrd  s       rY   r   z*_DocxPartitioner._style_based_element_typek  s9   $
t$
$
 $
 	$

 $
 $
 $
 $
 $
 $
 T$
 H$
 h$
 h$
 8$
  X!$
" X#$
$ &''#%%&=$
 $
 $
 F  o>)/*>K8
 (++J777r[   c                    t          j        |                     |                    \  }}d |D             d |D             fS )z<[contents, tags] pair describing emphasized text in `table`.c                    g | ]
}|d          S r?  r   r@  s     rY   r   z4_DocxPartitioner._table_emphasis.<locals>.<listcomp>  s    222q6222r[   c                    g | ]
}|d          S rC  r   r@  s     rY   r   z4_DocxPartitioner._table_emphasis.<locals>.<listcomp>  s    4W4W4W!QuX4W4W4Wr[   )rD  rE  r-  )r   r   iter_tbl_emphiter_tbl_emph_2s       rY   r&  z _DocxPartitioner._table_emphasis  sM    )2t7P7PQV7W7W)X)X&22M2224W4W4W4W4WXXr[   )NNNTTN)rD   rE   rF   rG   rK   rE   rx   rI   rJ   rI   rL   rE   rP   r   )rD   rE   rF   rG   rK   rE   rx   rI   rJ   rI   rL   rE   rP   r   )rP   r   )r   r   rP   r   )F)r   r   r   rI   rP   r>   )rP   r   r  )r   r   rP   r>   )rP   r   )r   r   rP   rI   )r   r   rP   r   )r   r   rP   r   )r   r   rP   r  )r   r  r   r   rP   r   )r   r   rP   r  )r   r   rP   r   )r   r   rP   r   )rP   rE   )rP   r:  )r   r   rP   r<  )r   r   rP   rI  )r   r   rP   r#   )r   r   rP   r  )rP   r  )rd  r>   rP   r  )r   r   rP   rw  )r   r   rP   r<  )&__name__
__module____qualname____doc__r   classmethodr}   r   r   r   r   r:   r   r   r   r   r   r   r   r   r   r   r   r   r-  r   r(  propertyr'  rH  rZ  r   r\  rc  rb  r   r   r&  r   r[   rY   r|   r|      s:       BB #'$(+/$(&*04$ $ $ $ $*  #'$(+/$(&*04
 
 
 
 [
8; ; ; ;:@ @ @ @)O )O )O )OV(
 (
 (
 (
 (
T # # # \# 9 9 9 \9( - - - \-M M M M<C C C C/ / / /,9 ,9 ,9 ,9\	1 	1 	1 	1 J  J  J  JDP P P P:$ $ $ $L
 
 
 
*H H H HV V V V< 6 6 6 \6$ R R R XRU U U U
&, &, &, &,P       &> > > >$      0   "08 08 08 08dY Y Y Y Y Yr[   r|   )rC   r>   rD   rE   rF   rG   rH   rI   rJ   rI   rK   rE   rL   rE   rM   rN   rO   rI   rP   rQ   )rD   rE   rF   rG   rK   rE   rx   rI   rH   rI   rJ   rI   rL   rE   ry   rE   rM   rN   rO   rI   rz   r   rP   rQ   )^
__future__r   r   r   rD  rU   r_   r   typingr   r   r   r   r	   r
   r   r   r   r   r=   docx.documentr   docx.enum.sectionr   docx.oxml.tabler   docx.oxml.text.paragraphr   docx.sectionr   r   r   
docx.tabler   r   r   r   docx.text.hyperlinkr   docx.text.pagebreakr   docx.text.paragraphr   docx.text.runr   r   typing_extensionsr   unstructured.chunkingr   unstructured.cleaners.corer    unstructured.documents.elementsr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-    unstructured.file_utils.filetyper.   r/   unstructured.partition.commonr0   r1   r2   unstructured.partition.langr3    unstructured.partition.text_typer4   r5   r6   r7   r8   unstructured.utilsr9   r:   r;   r<   r?   __annotations__r@   rA   rw   DOCXrs   r|   r   r[   rY   <module>r     s   # " " " " " "  				     				  ) ) ) ) ) )                         " " " " " " . . . . . . " " " " " " ) ) ) ) ) ) 2 2 2 2 2 2 2 2 2 2 ) ) ) ) ) ) " " " " " " " " ) ) ) ) ) ) 1 1 1 1 1 1 ) ) ) ) ) )             ' ' ' ' ' ' 7 7 7 7 7 7 4 4 4 4 4 4                                 R Q Q Q Q Q Q Q         
 < ; ; ; ; ;              V U U U U U U U U UZ   OOO     f- - - - -Y	12	 2 2 2 2 z"" # $!"&'+,0&,X(-S S S S #"Sl HM**" $'+ $!"&,0'+&,X(-8 8 8 8  +* 8vy
Y y
Y y
Y y
Y y
Y y
Y y
Y y
Y y
Y y
Yr[   