
    jW                    8   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZmZm Z m!Z! d dl"m#Z# d dl$m%Z% d d	l&m'Z( d d
l)m*Z* d dl+m,Z,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m'Z'm6Z6m7Z7m8Z8m9Z9m:Z: d dl;m<Z<m=Z= d dl>m?Z?m@Z@ d dlAmBZB d dlCmDZDmEZEmFZFmGZGmHZHmIZImJZJ d dlKmLZLmMZM d dlNmOZOmPZPmQZQ d dlRmSZS d dlTmUZUmVZV d dlWmXZXmYZY d dlZm[Z[ d dl\m]Z]m^Z^m_Z_m`Z`maZambZb d dlcmdZd d dlemfZfmgZg d dlhmiZi d dljmkZk er	 eiejl        _m         ejn        dejo                  Zpdd#Zq e:             e=e<jr                   e*            d$dd%ebjs        d%ddd&dddg dd%ddd%fdd<                                    Zt	 	 	 	 	 ddd@Zu	 	 dddCZv ekdD          d$dd%d%d%deajw        jx        ddddd%d%ddd%d%dfddN            Zyd$dd%d%ebjs        d%ddddd%ddd%fddOZzddQZ{ ekdRdS          ddV            Z|ddYZ} ej~        dZd[          d\             Ze`fdd`ZddcZddiZddmZ	 	 	 dddrZ ekdsdD          d$dd%dtgd%dfddu            Zddvd%de`fddzZ	 	 dddZddZddZddZddZddZ	 dddZddZddZddZdS )    )annotationsN)SpooledTemporaryFile)IOTYPE_CHECKINGAnyBinaryIODictIteratorListOptionalSequenceTupleUnioncast)psparser)LTCharLTContainerLTImageLTItem	LTTextBox)	PDFObjRef)open_filename)Image)add_chunking_strategy)%clean_extra_whitespace_with_index_run-index_adjustment_after_clean_extra_whitespace)
PixelSpace
PointSpace)
CoordinatesMetadataElementElementMetadataElementTyper   LinkListItem	PageBreakTextprocess_metadata)FileTypeadd_metadata_with_filetype)loggertrace_logger)PARAGRAPH_PATTERN)convert_to_bytesdocument_to_element_listexactly_oneget_last_modified_date get_last_modified_date_from_fileocr_data_to_elementsspooled_to_bytes_io_if_needed)check_languagesprepare_languages_for_tesseract)annotate_layout_elementscheck_element_types_to_extractsave_elements)$merge_inferred_with_extracted_layout)open_pdfminer_pages_generatorrect_to_bbox)determine_pdf_or_image_strategyvalidate_strategy)element_from_text)OCR_AGENT_TESSERACTSORT_MODE_BASICSORT_MODE_DONTSORT_MODE_XY_CUTOCRModePartitionStrategy)clean_pdfminer_inner_elements)coord_has_valid_pointssort_page_elements)parse_keyword)requires_dependenciesz\s+)patternflagsinfer_table_structureboolreturnstrc                N    | rdnd}t           j                            d|          S )Nyoloxyolox_quantizedUNSTRUCTURED_HI_RES_MODEL_NAME)osenvironget)rL   defaults     d/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/pdf.pydefault_hi_res_modelrY   t   s*     /Egg4EG:>>:GDDD     FTfilenamefile/Optional[Union[BinaryIO, SpooledTemporaryFile]]include_page_breaksstrategyocr_languagesOptional[str]	languagesOptional[List[str]]include_metadatametadata_filenamemetadata_last_modifiedchunking_strategylinksSequence[Link]hi_res_model_nameextract_images_in_pdfextract_element_typesimage_output_dir_pathextract_to_payloadList[Element]c                v    t          | |           t          ||          }t          d| ||||||	|||||d|S )a  Parses a pdf document into a list of interpreted elements.
    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    strategy
        The strategy to use for partitioning the PDF. Valid strategies are "hi_res",
        "ocr_only", and "fast". When using the "hi_res" strategy, the function uses
        a layout detection model to identify document elements. When using the
        "ocr_only" strategy, partition_pdf simply extracts the text from the
        document using OCR and processes it. If the "fast" strategy is used, the text
        is extracted directly from the PDF. The default strategy `auto` will determine
        when a page can be extracted using `fast` mode, otherwise it will fall back to `hi_res`.
    infer_table_structure
        Only applicable if `strategy=hi_res`.
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        The languages present in the document, for use in partitioning and/or OCR. To use a language
        with Tesseract, you'll first need to install the appropriate Tesseract language pack.
    metadata_last_modified
        The last modified date for the document.
    hi_res_model_name
        The layout detection model used when partitioning strategy is set to `hi_res`.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
        or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_element_types' for broader extraction capabilities.
    extract_element_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
        within metadata fields.
    extract_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    image_output_dir_path
        Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
        The filesystem path for saving images of the element type(s)
        specified in 'extract_element_types'.
    r\   r]   )r\   r]   r_   r`   rL   rc   rg   rk   rl   rm   rn   ro    )r/   r4   partition_pdf_or_image)r\   r]   r_   r`   rL   ra   rc   re   rf   rg   rh   ri   rk   rl   rm   rn   ro   kwargss                     rX   partition_pdfrv   ~   sq    V ----	=99I! /35+333-    rZ   !Optional[Union[bytes, IO[bytes]]]ru   r   c           	     x    t          |t                    rt          j        |          }t	          d| ||||d|S )Nr\   r]   r_   rc   rg   rs   )
isinstancebytesioBytesIO_partition_pdf_with_pdfminer)r\   r]   r_   rc   rg   ru   s         rX   extractable_elementsr      sZ     $  z$' /5    rZ   6Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]]Union[str, None]c                \    d }| s|rt          |          }n|s| rt          |           }|S )N)r\   )r]   )r0   r1   )r]   r\   last_modification_dates      rX   )get_the_last_modification_date_pdf_or_imgr      sW     " MH M!7!J!J!J M$ M!At!L!L!L!!rZ   unstructured_inference Optional[Union[bytes, BinaryIO]]is_imageocr_mode
model_namepdf_image_dpiOptional[int]pdf_text_extractableanalysisanalyzed_image_output_dir_pathc                *   ddl m}m} ddlm}m} ddlm}m} |dg}t          |          }|p|pt          |          }|	
|dk    rdnd	}	|	dk     r|dk    rt          j        d
|	 d           |s || |||	          }|r || |	          ng }|rt          ||| ||	|           t          ||          }|                    d          r|}n || ||||||	          }n |||||	          }t!          |d          r|                    d           |r |||	          ng }t          ||          }|                    d          r|}n7t!          |d          r|                    d            ||||||||	          }|dk    r
t$          |d<   t'          |          }|j        D ]}|j        D ]}|j        pd|_        t/          |fd||
d|d|} t1          |          }|r!t3          | t4          j        | |||	||           |D ],}!|r|!t4          j        k    rt3          | |!| |||	||           -g }"| D ]N}t9          |t:                    r|st9          |t<                    rw|sLt4          j        |vr>|j        6t?          |j                  dk     s|j                             d          dk    r~|"!                    tE          tF          |                     t9          |tH                    rtK          j&        tN          d|j        pd          (                                |_        |j        s*t9          |t:                    s|                    d          r(|"!                    tE          tF          |                     P|"S )z)Partition using package installed locallyr   )process_data_with_modelprocess_file_with_model)process_data_with_ocrprocess_file_with_ocr)process_data_with_pdfminerprocess_file_with_pdfminerNengchipperi,     zVThe Chipper model performs better when images are rendered with DPI >= 300 (currently z).)r   r   r   )r\   dpi)inferred_document_layoutextracted_layoutr\   output_dir_pathr   r   )r   r   )r   rL   ra   r   r   seek)r]   r   	sort_moder[   TF)sortabler_   r   infer_list_itemsrc   )elementselement_category_to_saver\   r]   r   r   ro   r       ))'unstructured_inference.inference.layoutr   r   $unstructured.partition.pdf_image.ocrr   r   4unstructured.partition.pdf_image.pdfminer_processingr   r   r5   rY   r*   warningr6   r9   
startswithhasattrr   rA   rE   pagesr   textr.   r7   r8   r"   IMAGErz   r%   r   lenfindappendr   r    r&   resub RE_MULTISPACE_INCLUDING_NEWLINESstrip)#r\   r]   r   rL   r_   rc   r   r   rk   r   rg   r   rl   rm   rn   ro   r   r   ru   r   r   r   r   r   r   ra   r   r   merged_document_layoutfinal_document_layoutpageelr   el_typeout_elementss#                                      rX   _partition_pdf_or_image_localr      s   .       
              
 G	3I>>M 	VZV+?@U+V+V  0I==3"3y"@"@,', , ,	
 	
 	

 |#:#:('	$
 $
 $
  $&&mLLLL 	  	$)A!1! >+!    "F%=-"
 "
 "

 ''	22 	$:!!$9$9&!&;+!+% % %!! $;#:('	$
 $
 $
  4   	IIaLLL I]d&&DmDDDDbd 	
 "F%=-"
 "
 "

 ''	22 	$:!!tV$$ 		!$9$9&!&;+!+% % %! I%%,{9:OPP%+ $ $- 	$ 	$BgmBGG	$ (/5    H ;;PQQ  

%0%6'11		
 		
 		
 		
 ) 
 
  	W0A%A%A%,'11		
 		
 		
 		
 		
 L 7 7b)$$ 	-@ 	b%   	7)7%-BBBW_BGr(9(9RW\\#=N=NRT=T=T ##D"$5$56666D!! 		7f02  egg	 G w 7*R33 77H7S7ST]7^7^ 7##D"$5$5666rZ   c                   t          ||           t          ||          }t          ||           }g }d}|s{	 t          d| t	          |          |||p|d|}t          d |D                       }n?# t          $ r2}t          j        |           t          j	        d           Y d}~nd}~ww xY wt          |||||
|          }||                    d           |t          j        k    rxt          j                    5  t          j        d	           t#          d| t	          |          |||||p||	||
|||d
|}t%          |          }ddd           n# 1 swxY w Y   nq|t          j        k    r|S |t          j        k    rOt          j                    5  t+          d| |||||p|d|}t%          |          }ddd           n# 1 swxY w Y   |S )zCParses a pdf or image document into a list of interpreted elements.)r]   r\   Fry   c              3  r   K   | ]2}t          |t                    o|j                                        V  3d S )N)rz   r&   r   r   ).0r   s     rX   	<genexpr>z)partition_pdf_or_image.<locals>.<genexpr>  sL       ' '=?
2t$$8' ' ' ' ' 'rZ   z3PDF text extraction failed, skip text extraction...N)r   r   rL   rl   rm   r   ignore)r\   r]   r   rL   r_   rc   rg   rk   r   rl   rm   rn   ro   )r\   r]   r_   rc   r   rg   rs   )r=   r4   r   r   r3   any	Exceptionr*   errorr   r<   r   rD   HI_RESwarningscatch_warningssimplefilterr   $_process_uncategorized_text_elementsFASTOCR_ONLY _partition_pdf_or_image_with_ocr)r\   r]   r   r_   r`   rL   ra   rc   rg   rk   rl   rm   rn   ro   ru   r   extracted_elementsr   er   r   s                        rX   rt   rt     sJ   . h)))	=99IF  
   R	R!5 "!2488$7#'='WAW" " " " $' ' 'CU' ' ' $ $    	R 	R 	RLOOONPQQQQQQQQ	R /1333  H 		!$+++$&& 	J 	J!(+++4 !2488!&;$7#'='WAW"3%9&;&;&;#5   H  @IIL%	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J( 
&+	+	+!!	&/	/	/$&& 
	J 
	J7 !$7#!'='WAW   H @IIL
	J 
	J 
	J 
	J 
	J 
	J 
	J 
	J 
	J 
	J 
	J 
	J 
	J 
	J 
	J s<   :A4 4
B0>(B++B0AEE"E%GGGr   c                    g }| D ]r}t          |d          rI|j        t          j        k    r4t	          t          t          |          j                  }|j        |_        n|}|	                    |           s|S )zProcesses a list of elements, creating a new list where elements with the
    category `UncategorizedText` are replaced with corresponding
    elements created from their text content.category)
r   r   r"   UNCATEGORIZED_TEXTr>   r   r&   r   metadatar   )r   r   r   new_els       rX   r   r   >  s~    
 L $ $2z"" 	r{k6T'T'T&tD"~~':;;F kFOOFF####rZ   pdfminerzlocal-inferenceOptional[IO[bytes]]	List[str]c           
     (   |dg}t          | |           | rPt          | d          5 }t          t          |          }t	          d|| |||d|}ddd           n# 1 swxY w Y   n)|r't          t          |          }t	          d|| |||d|}|S )a  Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
    processing or detectron2 is not available.

    Implementation is based on the `extract_text` implemenation in pdfminer.six, but
    modified to support tracking page numbers and working with file-like objects.

    ref: https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py
    Nr   rr   rb)fpr\   r_   rc   rg   rs   )r/   r   r   r   _process_pdfminer_pages)r\   r]   r_   rc   rg   ru   r   r   s           rX   r~   r~   O  s"   " G	---- 
8T** 		bh##B. !$7#'=   H		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 
 	
(D!!* 
 3#9
 
 
 
 Os   (AA!$A!itemr   c                    t          | d          r|                                 S t          | t                    rd}| D ]}|t	          |          pdz  }|S t          | t
          t          f          rdS dS )zrRecursively extracts text from PDFMiner objects to account
    for scenarios where the text is in a sub-container.get_textr[   
)r   r   rz   r   _extract_textr   r   )r   r   childs      rX   r   r   ~  s     tZ   }}	D+	&	& 	 	/ 	/EM%((.B.DD	D9g.	/	/  t4rZ   zpdfminer.pdfinterpz!PDFPageInterpreter.init_resourcesc                6    |d         }d|v r|d=  | |          S )Nr   
ColorSpacers   )wrappedinstanceargsru   	resourcess        rX   #pdfminer_interpreter_init_resourcesr     s.    QIy  l#79rZ   r   r   r   c                   g }t          t          |                     D ]P\  }\  }	}
|
j        |
j        }}g }g }t	          ||          }|	j        rt          |	j        |||dz             }|
D ]}t          |j        |          \  }}}}||||f}g }t          |          dk    rdt          |t                    rOt          |||dz             }t          ||          \  }}|D ]%}|                    t          ||                     &t!          |d          r|                                g}n)t%          |          }t'          j        t*          |          }|D ]}t-          |          \  }}|                                r||f||f||f||ff}t1          |||          }t3          ||          } t5          ||          }!t7          ||dz   | ||!|          |_        d|j        _        |                    |           t=          ||          }t?          |t@                    }"|t@          k    rt?          |"|          }"||"z  }|r#|                    tC          d	
                     R|S )z>Uses PDFMiner to split a document into pages and process them.)widthheight   r   r   )coordinatescoordinate_systempointssystem)r\   page_numberr   last_modifiedri   rc   r   r[   r   )"	enumerater:   r   r   r   annotsget_urisr;   bboxr   rz   r    check_annotations_within_element"get_word_bounding_box_from_elementr   map_bbox_and_indexr   r   r   r   splitr,   r   r   r>   r   _get_links_from_urls_metadatar!   r   detection_origin_combine_list_elementsrG   r@   r%   )#r   r\   r_   rc   rg   r   ru   r   ir   page_layoutr   r   page_elementsannotation_listr   objx1y1x2y2r   urls_metadataannotations_within_element_wordsannot_text_snippets_textmoved_indicesr   elementcoordinates_metadatari   sorted_page_elementss#                                      rX   r   r     s    !H"+,I",M,M"N"N H0 H0D+#);+=v')&
 
 
 ; 	V&t{F<MqSTuUUO .	2 .	2C)#(F;;NBBB#D24M?##a''JsI,F,F'-M#E. .*
 >c6JJ57 K KE!(();E5)I)IJJJJsJ'' D(+'7%c**!#*;U!C!C' 2 2'LU'S'S$};;== 2!2hR2r(RHEF/$**;  G
 ,?%0, , ,( :-WWE'6!)$%E$8&<#"+( ( (G$ 9CG$5!((111124 /}>OPP  2-QQ''#56JI#V#V (( 	0OOI2...///OrZ   r   Union[PixelSpace, PointSpace]c                @   d}g }| D ]}t          |t                    r|}|j        }|j        j        }nT|rRt          |j        j        |          r7| d|j         |_        t          |||          }|                                 |                    |           |S )zECombine elements that should be considered a single ListItem element.N)r   boundaryr   )element1element2r   )	rz   r$   r   r   r   check_coords_within_boundary"_combine_coordinates_into_element1popr   )r   r   tmp_elementupdated_elementsr  tmp_text
tmp_coordss          rX   r   r     s     K&( ) )gx(( 	#!K|H )5JJ 	#9(4
 
 
 	# #+;;W\;;K8$ "3  G   """((((rZ   r  List[Dict[str, Any]]r  
np.ndarray
List[Link]c           
         g }| D ]l}t          j        t                    5  |                    |d         |d         t	          |d         |          d           ddd           n# 1 swxY w Y   m|S )z+Extracts links from a list of URL metadata.r   uristart_index)r   urlr$  N)
contextlibsuppress
IndexErrorr   r   )r  r  ri   r%  s       rX   r   r     s     E   ,, 
	 
	LLKu:#PM*%$ $ 	 	 	
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 Ls   :A&&A*	-A*	r  r    r  c                   t          | j        j        j        d         d         |j        j        j        d         d                   }t	          | j        j        j        d         d         |j        j        j        d         d                   }t          | j        j        j        d         d         |j        j        j        d         d                   }t	          | j        j        j        d         d         |j        j        j        d         d                   }||f||f||f||ff}t          ||          | j        _        | S )zXCombine the coordiantes of two elements and apply the updated coordiantes to `elements1`r      r   r   )minr   r   r   maxr   )r  r  r   r  r  r  r  r   s           rX   r  r  '  s=    
%,Q/2%,Q/2
 
B 
%,Q/2%,Q/2
 
B 
%,Q/2%,Q/2
 
B 
%,Q/2%,Q/2
 
B 2hR2r(RH5F$7 % % %H! OrZ   
   
chunk_sizeintIterator[PILImage.Image]c              #    K   t          | |           |$t          |          }t          j        |          }nd }t          j        |           }|d         }t          d|dz   |          D ]R}t          ||z   dz
  |          }|t          j        |||          }nt          j        | ||          }|D ]}	|	V  Sd S )Nrr   Pagesr   )
first_page	last_page)	r/   r-   	pdf2imagepdfinfo_from_bytespdfinfo_from_pathranger+  convert_from_bytesconvert_from_path)
r\   r]   r.  f_bytesinfototal_pages
start_pageend_pagechunk_imagesimages
             rX   convert_pdf_to_imagesrB  C  s      ----"4((+G44*844w-KA{Q
;;  
zJ.2K@@$7%"  LL %6%"  L " 	 	EKKKK	 rZ   unstructured_pytesseractr   c           
        g }|rg }|t          j        |          nt          j        |           }	|                    |	           t          |          D ]/\  }
}	t	          d|	||
dz   ||d|}|                    |           0nAd}t          | |          D ].}	|dz  }t	          d|	||||d|}|                    |           /|S )zkPartitions an image or PDF using OCR. For PDFs, each page is converted
    to an image prior to processing.Nr   )rA  rc   r   r_   rg   r   rs   )PILImageopenr   r   +_partition_pdf_or_image_with_ocr_from_imageextendrB  )r\   r]   r_   rc   r   rg   ru   r   imagesrA  r   r  r   s                rX   r   r   e  s0    H +'+'7d###X]8=T=Te!&)) 		+ 		+HAuG #E$7'=   M OOM****		+ *8T:: 
	+ 
	+E1KG #'$7'=   M OOM****OrZ   r   rA  rE  r   c                j   ddl m}m}  |            }	t          |          }
|	t          k    rt
          } || |
|	          }t          || j        ||          }t          || j	        |          }|}|t
          k    rt          ||          }|r#|                    t          d                     |S )zQExtract `unstructured` elements from an image using OCR and perform partitioning.r   )get_layout_elements_from_ocrget_ocr_agent)rA  ra   	ocr_agent)r   filetyper   rc   )
image_sizecommon_metadatar[   r   )r   rK  rL  r5   r?   rA   r!   formatr2   sizerG   r   r%   )rA  rc   r   r_   rg   r   ru   rK  rL  rM  ra   ocr_datar   r  r  s                  rX   rG  rG    s          
 I3I>>M '''"	++#  H ,	  H ):   M )N""1-KK 8##I2$6$6$6777rZ   皙?333333?r   r   r  horizontal_thresholdfloatvertical_thresholdc                B   t          |           s,t          |          st          j        d|  d| d           dS |j        d         d         }|j        d         d         }|j        d         d         }|j        d         d         }||z
  }||z
  }	| j        d         d         |||z  z
  k    o3| j        d         d         |||z  z   k     o| j        d         d         |k    }
| j        d         d         |||	z  z   k     o| j        d         d         |||	z  z
  k    }|
o|S )a  Checks if the coordinates are within boundary thresholds.
    Parameters
    ----------
    coordinates
        a CoordinatesMetadata input
    boundary
        a CoordinatesMetadata to compare against
    vertical_threshold
        a float ranges from [0,1] to scale the vertical (y-axis) boundary
    horizontal_threshold
        a float ranges from [0,1] to scale the horizontal (x-axis) boundary
    zcoordinates z and boundary z did not pass validationFr   r*  r   )rF   r+   detailr   )r   r  rV  rX  boundary_x_minboundary_x_maxboundary_y_minboundary_y_max
line_widthline_heightx_within_boundaryy_within_boundarys               rX   r  r    sp   $ "+.. 7Mh7W7W X;XXhXXX	
 	
 	
 u_Q'*N_Q'*N_Q'*N_Q'*N.0J >1K 
	A	q	!N6JZ6W$X	X 	9"1%:NQ[:[(\\	9"1%7  	1a >5G+5U#VV[a #n8J[8X&YY  2!22rZ   r   !Union[PDFObjRef, List[PDFObjRef]]r   c                    t          | t                    rt          | |||          S |                                 }|g S t          ||||          S )a  
    Extracts URI annotations from a single or a list of PDF object references on a specific page.
    The type of annots (list or not) depends on the pdf formatting. The function detectes the type
    of annots and then pass on to get_uris_from_annots function as a List.

    Args:
        annots (Union[PDFObjRef, List[PDFObjRef]]): A single or a list of PDF object references
            representing annotations on the page.
        height (float): The height of the page in the specified coordinate system.
        coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent
            the annotations' coordinates.
        page_number (int): The page number from which to extract annotations.

    Returns:
        List[dict]: A list of dictionaries, each containing information about a URI annotation,
        including its coordinates, bounding box, type, URI link, and page number.
    )rz   r   get_uris_from_annotsresolve)r   r   r   r   resolved_annotss        rX   r   r     s[    . &$ T#FF4E{SSSnn&&O	9JKXXXrZ   List[PDFObjRef]Union[int, float]c           	        g }| D ]}t          |          }t          |t                    s(d|v r|d         nd}|r(t          |t                    st	          |          dk    rad|v r|d         nd}|r(t          |t                    st          |          dk    rt          ||          \  }	}
}}|	|
f|	|f||f||
ff}t          ||          }d|vrt          |d                   }t          |t                    sd}d|v r0t          |d         t                    st	          |d                   }d}	 |d	k    r5t          t          |d
                                                 d          }|dk    r5t          t          |d                                                 d          }n# t          $ r Y nw xY w|
                    ||	|
||f|||d           |S )a  
    Extracts URI annotations from a list of PDF object references.

    Args:
        annots (List[PDFObjRef]): A list of PDF object references representing annotations on
            a page.
        height (Union[int, float]): The height of the page in the specified coordinate system.
        coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent
            the annotations' coordinates.
        page_number (int): The page number from which to extract annotations.

    Returns:
        List[dict]: A list of dictionaries, each containing information about a URI annotation,
        including its coordinates, bounding box, type, URI link, and page number.
    SubtypeNz/'Link'Rect   r   ASz/'URI'URIzutf-8z/'GoTo'D)r   r   typer#  r   )try_resolverz   dictr   rO   r   r;   r   decoder   r   )r   r   r   r   r  
annotationannotation_dictsubtyperectr  r  r  r  r   r  uri_dicturi_typer#  s                     rX   re  re    sS   * O -
 -

%j11/400 	09_0L0L/),,RV 	*Wi88 	CLLI<U<U*0O*C*Cv&& 	z$	22 	c$ii1nn%dF33BBr(RHr2hR92$ 
  
  

 o%%s344(D)) 	(??:hsmY#G#G?8C=))H	8##!+huo">">??FFwOO9$$!+hsm"<"<==DDWMM 	 	 	D	 	3RR( * 	
 	
 	
 	
 s   <A6F33
G ?G r  r   c                P    	 |                                  S # t          $ r | cY S w xY w)z
    Attempt to resolve a PDF object reference. If successful, returns the resolved object;
    otherwise, returns the original reference.
    )rf  r   )r  s    rX   rs  rs  Y  s9    
}}   s    %%bbox1!Tuple[float, float, float, float]bbox2c                    | \  }}}}|\  }}}}	t          ||          }
t          ||          }t          ||          }t          ||	          }|
|k     r||k     rt          |
|||f          }|S dS )a  
    Calculate the area of intersection between two bounding boxes.

    Args:
        bbox1 (Tuple[float, float, float, float]): The coordinates of the first bounding box
            in the format (x1, y1, x2, y2).
        bbox2 (Tuple[float, float, float, float]): The coordinates of the second bounding box
            in the format (x1, y1, x2, y2).

    Returns:
        float: The area of intersection between the two bounding boxes. If there is no
        intersection, the function returns 0.0.
    g        )r,  r+  calculate_bbox_area)r}  r  x1_1y1_1x2_1y2_1x1_2y1_2x2_2y2_2x_intersectiony_intersectionx2_intersectiony2_intersectionintersection_areas                  rX   calculate_intersection_arear  d  s    " #D$d"D$dt__Nt__N$ooO$ooO''N_,L,L/^_oN
 
 ! srZ   r   c                *    | \  }}}}||z
  ||z
  z  }|S )a(  
    Calculate the area of a bounding box.

    Args:
        bbox (Tuple[float, float, float, float]): The coordinates of the bounding box
            in the format (x1, y1, x2, y2).

    Returns:
        float: The area of the bounding box, computed as the product of its width and height.
    rs   )r   r  r  r  r  areas         rX   r  r    s(     NBBGR DKrZ   ?r  element_bbox	thresholdc                    g }| D ]W}|d         |k    rIt          |d                   }|r2t          ||d                   |z  |k    r|                    |           X|S )a  
    Filter annotations that are within or highly overlap with a specified element on a page.

    Args:
        annotation_list (List[Dict[str,Any]]): A list of dictionaries, each containing information
            about an annotation.
        element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the
            specified element in the bbox format (x1, y1, x2, y2).
        page_number (int): The page number to which the annotations and element belong.
        threshold (float, optional): The threshold value (between 0.0 and 1.0) that determines
            the minimum overlap required for an annotation to be considered within the element.
            Default is 0.9.

    Returns:
        List[Dict[str,Any]]: A list of dictionaries containing information about annotations
        that are within or highly overlap with the specified element on the given page, based on
        the specified threshold.
    r   r   )r  r  r   )r  r  r   r  r	  rv  annotation_bbox_sizes          rX   r   r     s    0 "$% > >
m$33#6z&7I#J#J # >+L*V:LMMPdd  +11*===%%rZ   r  r   )Tuple[List[LTChar], List[Dict[str, Any]]]c           	        g }g }d}| D ]`}d}d\  }}}	}
d}t          |          D ]/\  }}t          |t                    r|                    |           |                                }|r4|                                s |                    ||||	|
f|d           d}{|s|                                }|rK|                                |k    r3|                                }|                    ||||	|
f|d           d}t          |          dk    r(||z   }|j        }||j	        z
  }
|j
        }	||j        z
  }n|j
        }	||j	        z
  }
||z  }1|t          |          z  }b||fS )a|  
    Extracts characters and word bounding boxes from a PDF text element.

    Args:
        obj (LTTextBox): The PDF text element from which to extract characters and words.
        height (float): The height of the page in the specified coordinate system.

    Returns:
        Tuple[List[LTChar], List[Dict[str,Any]]]: A tuple containing two lists:
            - List[LTChar]: A list of LTChar objects representing individual characters.
            - List[Dict[str,Any]]]: A list of dictionaries, each containing information about
                a word, including its text, bounding box, and start index in the element's text.
    r   r[   )NNNN)r   r   r$  )r   rz   r   r   r   r   isalnumr   x0y0r  r  )r  r   
charactersr  text_len	text_linewordr  r  r  r  r$  index	charactercharr  s                   rX   r   r     s   " JEH &# &#	/BB )) 4 4 !	 !	E9)V,,  !!),,, ))++ 

 LL!%BB/?P[\\   D  -"llnnG DLLNNg55"llnnGLL!%BB/?P[\\   Dt99>>"*U"2K"B),.B"B),.BB"B),.BC	NN"urZ   r  Dict[str, Any]c                "   t          |           dk    rd|d<   d|d<   |S t          j        |d         d         t          j        d | D                       z
  dz  |d         d	         t          j        d
 | D                       z
  dz  z             }t          j        |d         d         t          j        d | D                       z
  dz  |d         d         t          j        d | D                       z
  dz  z             }t	          |          }t	          |          }d}||k    r-t          ||d	z             D ]}|dz  }|| |         d         z  }n| |         d         }|                                |d<   | |         d         |d<   |S )aq  
    Maps a bounding box annotation to the corresponding text and start index within a list of words.

    Args:
        words (List[Dict[str,Any]]): A list of dictionaries, each containing information about
            a word, including its text, bounding box, and start index.
        annot (Dict[str,Any]): The annotation dictionary to be mapped, which will be updated with
        "text" and "start_index" fields.

    Returns:
        dict: The updated annotation dictionary with "text" representing the mapped text and
            "start_index" representing the start index of the mapped text in the list of words.
    r   r[   r   r   r$  r   c                *    g | ]}|d          d         S )r   r   rs   r   r  s     rX   
<listcomp>z&map_bbox_and_index.<locals>.<listcomp>       %H%H%H$d6l1o%H%H%HrZ   r*  r   c                *    g | ]}|d          d         S )r   r   rs   r  s     rX   r  z&map_bbox_and_index.<locals>.<listcomp>       'J'J'JDVQ'J'J'JrZ   c                *    g | ]}|d          d         S )r   r*  rs   r  s     rX   r  z&map_bbox_and_index.<locals>.<listcomp>  r  rZ      c                *    g | ]}|d          d         S )r   r  rs   r  s     rX   r  z&map_bbox_and_index.<locals>.<listcomp>  r  rZ   r   )r   npsqrtarray
try_argminr8  r   )r  r  distance_from_bbox_startdistance_from_bbox_endclosest_startclosest_endr   r
  s           rX   r   r     s    5zzQf!m!w	vq	BH%H%H%%H%H%HII	IaO=bh'J'JE'J'J'JKKKPQ
Q	R     W	vq	BH%H%H%%H%H%HII	IaO=bh'J'JE'J'J'JKKKPQ
Q	R  788M344K Dm##}kAo66 	% 	%ACKDE!HV$$DD	% ]#F+JJLLE&M />E-LrZ   r  c                h    	 t          t          j        |                     S # t          $ r Y dS w xY w)a;  
    Attempt to find the index of the minimum value in a NumPy array.

    Args:
        array (np.ndarray): The NumPy array in which to find the minimum value's index.

    Returns:
        int: The index of the minimum value in the array. If the array is empty or an
        IndexError occurs, it returns -1.
    r   )r/  r  argminr(  )r  s    rX   r  r  $  sA    29U##$$$   rrs    # 
11)rL   rM   rN   rO   )$r\   rO   r]   r^   r_   rM   r`   rO   rL   rM   ra   rb   rc   rd   re   rM   rf   rb   rg   rb   rh   rb   ri   rj   rk   rb   rl   rM   rm   rd   rn   rb   ro   rM   rN   rp   )r[   NFNN)r\   rO   r]   rw   r_   rM   rc   rd   rg   rb   ru   r   )Nr[   )r]   r   r\   rb   rN   r   )&r\   rO   r]   r   r   rM   rL   rM   r_   rM   rc   rd   r   rO   r   rb   rk   rb   r   r   rg   rb   r   rM   rl   rM   rm   rd   rn   rb   ro   rM   r   rM   r   rb   rN   rp   )r\   rO   r]   r   r   rM   r_   rM   r`   rO   rL   rM   ra   rb   rc   rd   rg   rb   rk   rb   rl   rM   rm   rd   rn   rb   ro   rM   rN   rp   )r   rp   )r\   rO   r]   r   r_   rM   rc   r   rg   rb   ru   r   rN   rp   )r   r   rN   rO   )r   r   r\   rO   r_   rM   rc   r   rg   rb   r   rO   )r   rp   r   r  rN   rp   )r  r  r  r   rN   r!  )r  r    r  r    r   r  rN   r    )r[   Nr-  )r\   rO   r]   rw   r.  r/  rN   r0  )r\   rO   r]   rw   r_   rM   rc   rd   r   rM   rg   rb   )rA  rE  rc   rd   r   r/  r_   rM   rg   rb   r   rO   rN   rp   )rT  rU  )
r   r   r  r   rV  rW  rX  rW  rN   rM   )
r   rc  r   rW  r   r  r   r/  rN   r  )
r   rh  r   ri  r   r  r   r/  rN   r  )r  r   )r}  r~  r  r~  rN   rW  )r   r~  rN   rW  )r  )
r  r  r  r~  r   r/  r  rW  rN   r  )r  r   r   rW  rN   r  )r  r  r  r  )r  r   rN   r/  )
__future__r   r&  r|   rT   r   r   tempfiler   typingr   r   r   r   r	   r
   r   r   r   r   r   r   numpyr  r5  wraptr   r   pdfminer.layoutr   r   r   r   r   pdfminer.pdftypesr   pdfminer.utilsr   PILr   rE  unstructured.chunkingr   unstructured.cleaners.corer   r   "unstructured.documents.coordinatesr   r   unstructured.documents.elementsr   r    r!   r"   r#   r$   r%   r&   r'    unstructured.file_utils.filetyper(   r)   unstructured.loggerr*   r+   unstructured.nlp.patternsr,   unstructured.partition.commonr-   r.   r/   r0   r1   r2   r3   unstructured.partition.langr4   r5   0unstructured.partition.pdf_image.pdf_image_utilsr6   r7   r8   r   r9   /unstructured.partition.pdf_image.pdfminer_utilsr:   r;   !unstructured.partition.strategiesr<   r=   unstructured.partition.textr>   &unstructured.partition.utils.constantsr?   r@   rA   rB   rC   rD   0unstructured.partition.utils.processing_elementsrE   $unstructured.partition.utils.sortingrF   rG   unstructured.patches.pdfminerrH   unstructured.utilsrI   PSBaseParser_parse_keywordcompileDOTALLr   rY   PDFAUTOrv   r   r   	FULL_PAGEvaluer   rt   r   r~   r   patch_function_wrapperr   r   r   r   r  rB  r   rG  r  r   re  rs  r  r  r   r   r   r  rs   rZ   rX   <module>r     s   " " " " " "     				 				 				  ) ) ) ) ) )                                                        ( ' ' ' ' ' ( ( ( ( ( ( ! ! ! ! ! ! 7 7 7 7 7 7        F E E E E E E E                               5 4 4 4 4 4 4 4 7 7 7 7 7 7                                 
             a ` ` ` ` ` ` ` 9 9 9 9 9 9                [ Z Z Z Z Z        8 7 7 7 7 7 4 4 4 4 4 4 	 (5  $#-2:fBI#N#N#N  E E E E HL))<@ %%*"'#'%)!'+,0'+'+"'15+/$#Z Z Z Z  *) Z| .2 %%),0    * DH 	" 	" 	" 	" 	" /00-1"' %%)%+ $'+#',0!&"'15+/$48%S S S S 10Sn CG %%*"'#'%),0'+"'15+/$f f f f fR   " z#455+ + + 65+\   . 24WXX  YX &V V V V Vr   8   (   : .2    D 13KLL.2 %&+W,0* * * * ML*^ &* %,0%1 1 1 1 1n #& #	)3 )3 )3 )3 )3XY Y Y Y>D D D DN      D   ( 	!& !& !& !& !&H< < < <~( ( ( (V     rZ   