
    jB                       d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8m9Z9 d dl:m;Z; dZ< e+             e.e-j=                   e             ddddddddddgdfd.d+                                    Z> G d, d-          Z?dS )/    )annotationsN)SpooledTemporaryFile)IOAnyIteratorListOptionalSequenceTupleUnion)Presentation)Shape)	BaseShape)GraphicFrame)
GroupShape)_BaseGroupShapes)Slide)
_Paragraph)add_chunking_strategy)
ElementElementMetadataEmailAddressListItemNarrativeText	PageBreakTableTextTitleprocess_metadata)FileTypeadd_metadata_with_filetype)convert_ms_office_table_to_textexactly_oneget_last_modified_date get_last_modified_date_from_file)apply_lang_metadata)is_email_addressis_possible_narrative_textis_possible_title)lazypropertypptxTFautofilenameOptional[str]fileOptional[IO[bytes]]include_page_breaksboolmetadata_filenameinclude_metadatametadata_last_modifiedinclude_slide_notesinfer_table_structurechunking_strategy	languagesOptional[List[str]]detect_language_per_elementkwargsr   returnList[Element]c                T   t          | |           t          |t                    r;|                    d           t	          j        |                                          }|p| }|J t                              ||||||          }t          ||	|
          }t          |          S )a  Partition PowerPoint document in .pptx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, includes a PageBreak element between slides
    metadata_filename
        The filename to use for the metadata. Relevant because partition_ppt() converts its
        (legacy) .ppt document to .pptx before partition. We want the filename of the original
        .ppt source file in the metadata.
    metadata_last_modified
        The last modified date for the document.
    include_slide_notes
        If True, includes the slide notes as element
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    )r-   r/   r   N)elementsr9   r;   )r#   
isinstancer   seekioBytesIOread_PptxPartitioneriter_presentation_elementsr&   list)r-   r/   r1   r3   r4   r5   r6   r7   r8   r9   r;   r<   source_filer@   s                 e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/pptx.pypartition_pptxrK   0   s    d ----
 $,-- '		!z$))++&&"(K""":: H #$?  H
 >>    c                      e Zd ZdZ	 	 	 	 	 d6d7dZed8d            Zd9dZed:d            Z	d;dZ
d<dZd=dZd>d Zd?d!Zd?d"Zd@d&Zed:d'            ZdAd)ZedBd+            ZedCd-            ZdDd0ZdEdFd5ZdS )GrF   z;Provides `.partition()` for PowerPoint 2007+ (.pptx) files.TFNr/   Union[str, IO[bytes]]r1   r2   r6   r7   r3   r.   r5   r=   Nonec                h    || _         || _        || _        || _        || _        || _        d| _        d S Nr   )_file_include_page_breaks_include_slide_notes_infer_table_structure_metadata_filename_metadata_last_modified_page_counter)selfr/   r1   r6   r7   r3   r5   s          rJ   __init__z_PptxPartitioner.__init__   sA     
$7!$7!&;#"3'=$rL   Iterator[Element]c                F     | ||||||                                           S )zFPartition MS Word documents (.docx format) into its document elements.)_iter_presentation_elements)clsr/   r1   r6   r7   r3   r5   s          rJ   rG   z+_PptxPartitioner.iter_presentation_elements   s8     s!"
 
 &
%
'
'	(rL   c              #    K   | j         j        D ]}|                                 E d{V  |                     |          E d{V  |                     |          \  }}|D ]}|j        r3t          |t                    sJ |                     |          E d{V  <|j	        rTt          |t                    sJ ||k    r|                     |          E d{V  ||                     |          E d{V  dS )zAGenerate each document-element in presentation in document order.N)_presentationslides_increment_page_number_iter_maybe_slide_notes_order_shapes	has_tablerA   r   _iter_table_elementhas_text_framer   _iter_title_shape_element_iter_shape_elements)rY   slidetitle_shapeshapesshapes        rJ   r]   z,_PptxPartitioner._iter_presentation_elements   sk      '. 	D 	DE2244444444433E:::::::::"&"4"4U";";K 	D 	D? D%e\:::::#77>>>>>>>>>>) D%eU33333++#'#A#A%#H#HHHHHHHHH#'#<#<U#C#CCCCCCCC	D	D 	DrL   c                d    | j         r| j         n"t          | j        t                    r| j        ndS )zMSuitable for use as metadata.filename, does not necessarily name source-file.N)rV   rA   rR   strrY   s    rJ   	_filenamez_PptxPartitioner._filename   s;    
 &D## $*c**	
rL   Iterator[PageBreak]c              #     K   | xj         dz  c_         | j         dk     rdS | j        rt          dt                    V  dS dS )zGIncrement page-number by 1 and generate a PageBreak element if enabled.      N )detection_origin)rX   rS   r   DETECTION_ORIGINrp   s    rJ   rb   z'_PptxPartitioner._increment_page_number   se      a!!F$ 	CB1ABBBBBBBB	C 	CrL   	paragraphr   c                P    t          |j                            d                    S )zTrue when `paragraph` has a bullet-charcter prefix.

        Bullet characters in the openxml schema are represented by buChar.
        z./a:pPr/a:buChar)r2   _pxpath)rY   ry   s     rJ   _is_bulleted_paragraphz'_PptxPartitioner._is_bulleted_paragraph   s#     IL&&'9::;;;rL   rj   r   Iterator[NarrativeText]c              #     K   | j         sdS |j        sdS |j        }|j        }|sdS |j                                        }|sdS t          ||                                 t                    V  dS )z?Generate zero-or-one NarrativeText element for the slide-notes.Ntextmetadatarw   )	rT   has_notes_slidenotes_slidenotes_text_framer   stripr   _text_metadatarx   )rY   rj   r   r   
notes_texts        rJ   rc   z(_PptxPartitioner._iter_maybe_slide_notes   s       ( 	F $ 	F'&7   	F%*0022
  	F((**-
 
 
 	
 	
 	
 	
 	
rL   rm   r   c                d    t          |j        o|j        o|j        dk     p
|j        dk               S rQ   )r2   topleft)rY   rm   s     rJ   _is_invalid_shapez"_PptxPartitioner._is_invalid_shape  s2     UY-5:TEIM4SUZRS^UUUrL   c              #    K   |                      |          rdS d}|j        j        D ]}|j        }|                                dk    r"|                     |          r7|j        pd}t          ||                     |          t                    V  nt          |          rt          |t                    V  t          ||                     |          t                    V  |dz  }dS )zGenerate Title element for each paragraph in title `shape`.

        Text is most likely a title, but in the rare case that the title shape was used
        for the slide body text, also check for bulleted paragraphs.Nr   rv   category_depthr   r   rw   rt   )r   
text_frame
paragraphsr   r   r}   levelr   r   rx   r'   r   r   )rY   rm   depthry   r   bullet_depths         rJ   rh   z*_PptxPartitioner._iter_title_shape_element  s?     
 !!%(( 	F)4 	 	I>Dzz||r!!**955 (3!!000MM%5      
 "$'' 	"?OPPPPPPP !000FF%5     
 
+	 	rL   c              #    K   |                      |          rdS |j        j        D ]}|j        }|                                dk    r#|j        pd}|                     |          }|                     |          rt          ||t                    V  qt          |          rt          |t                    V  t          |          rt          ||t                    V  t          |          r4|                     |dz             }t          ||t                    V  t!          ||t                    V   dS )z?Generate Text or subtype element for each paragraph in `shape`.Nrv   r   r   r   r   rt   )r   r   r   r   r   r   r   r}   r   rx   r'   r   r(   r   r)   r   r   )rY   rm   ry   r   r   r   s         rJ   ri   z%_PptxPartitioner._iter_shape_elements(  s     !!%(( 	F)4 	\ 	\I>Dzz||r!!O(qE**%*@@H**955 \D8N^_______!$'' \"?OPPPPPPP+D11 \#%%5      
 #4(( \..eai.HHK[\\\\\\\xJZ[[[[[[[/	\ 	\rL   graphfrmr   Iterator[Table]c              #     K   t          |j        d                                          }|sdS d}| j        rt          |j        d          }t	          ||                     |          t                    V  dS )zzGenerate zero-or-one Table element for the table in `shape`.

        An empty table does not produce an element.
        F)as_htmlNTr   )r"   tabler   rU   r   _table_metadatarx   )rY   r   
text_table
html_tables       rJ   rf   z$_PptxPartitioner._iter_table_elementF  s      
 5X^USSSYY[[
 	F
& 	W8QUVVVJ))*55-
 
 
 	
 	
 	
 	
 	
rL   c                    | j         r| j         S | j        }t          |t                    r&|                    d          rdnt          |          S t          |          S )z8Last-modified date suitable for use in element metadata.z/tmpN)rW   rR   rA   ro   
startswithr$   r%   )rY   r/   s     rJ   _last_modifiedz_PptxPartitioner._last_modifiedW  sh    
 ' 	0//z dC   	U??622T448Nt8T8TT
 0555rL   +Tuple[Optional[Shape], Sequence[BaseShape]]c                n    dfddd	}|j         j        t           |j                   |
          fS )zOrders the shapes on `slide` from top to bottom and left to right.

        Returns the title shape if it exists and the ordered shapes.rl   r   r=   Iterator[BaseShape]c              3  v   K   | D ]2}t          |t                    r |j                  E d {V  .|V  3d S )N)rA   r   rl   )rl   rm   iter_shapess     rJ   r   z3_PptxPartitioner._order_shapes.<locals>.iter_shapeso  sd          eZ00  *{5<8888888888KKKK	   rL   rm   r   Tuple[int, int]c                &    | j         pd| j        pdfS rQ   )r   r   )rm   s    rJ   sort_keyz0_PptxPartitioner._order_shapes.<locals>.sort_keyv  s    9>5:?22rL   )key)rl   r   r=   r   )rm   r   r=   r   )rl   titlesorted)rY   rj   r   r   s      @rJ   rd   z_PptxPartitioner._order_shapesj  s`    
	  	  	  	  	  	 	3 	3 	3 	3 |!6++el*C*C#R#R#RRRrL   Optional[int]c                    | j         S )z The current page (slide) number.)rX   rp   s    rJ   _page_numberz_PptxPartitioner._page_number{  s     !!rL   r   c                4    t          j        | j                  S )zKThe python-pptx `Presentation` object loaded from the provided source file.)r+   r   rR   rp   s    rJ   r`   z_PptxPartitioner._presentation  s      ,,,rL   text_as_htmlro   c                b    t          | j        | j        | j        |          }t          |_        |S )z=ElementMetadata instance suitable for use with Table element.)r-   last_modifiedpage_numberr   r   rq   r   r   rx   rw   )rY   r   element_metadatas      rJ   r   z _PptxPartitioner._table_metadata  s<    *^-)%	
 
 
 -=)rL   r   r   intr   c                b    t          | j        | j        | j        |          }t          |_        |S )zAElementMetadata instance suitable for use with Text and subtypes.)r-   r   r   r   r   )rY   r   r   s      rJ   r   z_PptxPartitioner._text_metadata  s<    *^-))	
 
 
 -=)rL   )TFTNN)r/   rN   r1   r2   r6   r2   r7   r2   r3   r.   r5   r.   r=   rO   )r/   rN   r1   r2   r6   r2   r7   r2   r3   r.   r5   r.   r=   r[   )r=   r[   )r=   r.   )r=   rr   )ry   r   r=   r2   )rj   r   r=   r~   )rm   r   r=   r2   )rm   r   r=   r[   )r   r   r=   r   )rj   r   r=   r   )r=   r   )r=   r   )r   ro   )r   )r   r   r=   r   )__name__
__module____qualname____doc__rZ   classmethodrG   r]   r*   rq   rb   r}   rc   r   rh   ri   rf   r   rd   propertyr   r`   r   r    rL   rJ   rF   rF   ~   s       EE %)$)&*+/04    ( ( ( ( [(&D D D DB 
 
 
 \
C C C C< < < <
 
 
 
8V V V V
   @\ \ \ \<
 
 
 
" 6 6 6 \6$S S S S" " " " X" - - - \-	  	  	  	 	  	  	  	  	  	  	 rL   rF   )r-   r.   r/   r0   r1   r2   r3   r.   r4   r2   r5   r.   r6   r2   r7   r2   r8   r.   r9   r:   r;   r2   r<   r   r=   r>   )@
__future__r   rC   tempfiler   typingr   r   r   r   r	   r
   r   r   r+   pptx.presentationr   pptx.shapes.autoshaper   pptx.shapes.baser   pptx.shapes.graphfrmr   pptx.shapes.groupr   pptx.shapes.shapetreer   
pptx.slider   pptx.text.textr   unstructured.chunkingr   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   r    unstructured.file_utils.filetyper    r!   unstructured.partition.commonr"   r#   r$   r%   unstructured.partition.langr&    unstructured.partition.text_typer'   r(   r)   unstructured.utilsr*   rx   PPTXrK   rF   r   rL   rJ   <module>r      s   " " " " " " 				 ) ) ) ) ) ) L L L L L L L L L L L L L L L L L L L L  * * * * * * ' ' ' ' ' ' & & & & & & - - - - - - ( ( ( ( ( ( 2 2 2 2 2 2       % % % % % % 7 7 7 7 7 7                        R Q Q Q Q Q Q Q            < ; ; ; ; ;         
 , + + + + +  HM**" $ $'+!,0 %"&'+&,X(-H H H H  +* HV[  [  [  [  [  [  [  [  [  [ rL   