
    je                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl mZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ erd dlm Z  d	Z! ed
          Z" ed          Z#deee$                  de$fdZ% G d dee"                   Z&dee         de$ddfdZ'de$dee         fdZ(	 d?dee$ee$         f         dee$         deee#e"f         gee#e"f         f         fdZ)de$fdZ*d?dee$         fdZ+dedeeef         fdZ,dedefdZ-dedefdZ.d  Z/d!e$d"e0defd#Z1d$e$d%e$d"e0de2effd&Z3d$e$d%e$de2ee$ffd'Z4	 d@d)eeef         d*eeef         d+e2de5fd,Z6	 dAd.eeef         d/eeef         d0e$fd1Z7	 dBd3eeeeef                  eeeef                  f         d4eee$         ee$         f         d5eee$         ee$         f         d6eee$         ee$         f         d7e2f
d8Z8	 	 dCd3eeeeef                  eeeef                  f         d4eee$         ee$         f         d5eee$         ee$         f         d:e0d7e2f
d;Z9	 	 dCd<ed=         d:e0d7e2de5ee         ffd>Z:dS )D    N)datetime)wraps)combinations)TYPE_CHECKINGAnyCallableDictGenericIterableIteratorListOptionalSequenceTupleTypeVarUnioncast)	ParamSpec)__version__)Text)z%Y-%m-%dz%Y-%m-%dT%H:%M:%Sz%Y-%m-%d+%H:%M:%Sz%Y-%m-%dT%H:%M:%S%z_T_Pmatrixreturnc                 
   dt           t           t                            dt          t                   ffd}dt           t                   dt          t                   fd| r"dd                     ||                      dndS )	a  Form an HTML table from "rows" and "columns" of `matrix`.

    Character overhead is minimized:
    - No whitespace padding is added for human readability
    - No newlines ("
") are added
    - No `<thead>`, `<tbody>`, or `<tfoot>` elements are used; we can't tell where those might be
      semantically appropriate anyway so at best they would consume unnecessary space and at worst
      would be misleading.
    rows_of_cell_strsr   c              3   d   K   | D ])}|sdd                      |                     dV  *d S )Nz<tr> z</tr>)join)r   row_cell_strsiter_tdss     \/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/utils.pyiter_trsz.htmlify_matrix_of_cell_texts.<locals>.iter_trs5   s_      . 	A 	AM  @-!8!899@@@@@@@		A 	A    r    c              3      K   | D ]X}t          j        |          }d                    |                    d                    }d|                                 dV  Yd S )Nz<br/>
z<td>z</td>)htmlescaper   splitstrip)r    ss     r"   r!   z.htmlify_matrix_of_cell_texts.<locals>.iter_tds<   si       	* 	*AAAQWWT]]++A))))))))	* 	*r$   z<table>r   z</table>)r   strr   r   )r   r#   r!   s     @r"   htmlify_matrix_of_cell_textsr-   *   s    AHXc]$; A A A A A A A* *(3- * * * * =CJ8RWWXXf--..8888Jr$   c                   \    e Zd ZdZdedef         ddfdZddededefd	Zded
eddfdZ	dS )lazypropertya$  Decorator like @property, but evaluated only on first access.

    Like @property, this can only be used to decorate methods having only a `self` parameter, and
    is accessed like an attribute on an instance, i.e. trailing parentheses are not used. Unlike
    @property, the decorated method is only evaluated on first access; the resulting value is
    cached and that same value returned on second and later access without re-evaluation of the
    method.

    Like @property, this class produces a *data descriptor* object, which is stored in the __dict__
    of the *class* under the name of the decorated method ('fget' nominally). The cached value is
    stored in the __dict__ of the *instance* under that same name.

    Because it is a data descriptor (as opposed to a *non-data descriptor*), its `__get__()` method
    is executed on each access of the decorated attribute; the __dict__ item of the same name is
    "shadowed" by the descriptor.

    While this may represent a performance improvement over a property, its greater benefit may be
    its other characteristics. One common use is to construct collaborator objects, removing that
    "real work" from the constructor, while still only executing once. It also de-couples client
    code from any sequencing considerations; if it's accessed from more than one location, it's
    assured it will be ready whenever needed.

    Loosely based on: https://stackoverflow.com/a/6849299/1902513.

    A lazyproperty is read-only. There is no counterpart to the optional "setter" (or deleter)
    behavior of an @property. This is critically important to maintaining its immutability and
    idempotence guarantees. Attempting to assign to a lazyproperty raises AttributeError
    unconditionally.

    The parameter names in the methods below correspond to this usage example::

        class Obj(object)

            @lazyproperty
            def fget(self):
                return 'some result'

        obj = Obj()

    Not suitable for wrapping a function (as opposed to a method) because it is not callable.
    fget.r   Nc                 V    || _         |j        | _        t          j        | |           dS )aY  *fget* is the decorated method (a "getter" function).

        A lazyproperty is read-only, so there is only an *fget* function (a regular
        @property can also have an fset and fdel function). This name was chosen for
        consistency with Python's `property` class which uses this name for the
        corresponding parameter.
        N)_fget__name___name	functoolsupdate_wrapper)selfr0   s     r"   __init__zlazyproperty.__init__s   s-     
]
 t,,,,,r$   objtypec                     || S |j                             | j                  }|$|                     |          }||j         | j        <   t	          t
          |          S )a  Called on each access of 'fget' attribute on class or instance.

        *self* is this instance of a lazyproperty descriptor "wrapping" the property
        method it decorates (`fget`, nominally).

        *obj* is the "host" object instance when the attribute is accessed from an
        object instance, e.g. `obj = Obj(); obj.fget`. *obj* is None when accessed on
        the class, e.g. `Obj.fget`.

        *type* is the class hosting the decorated getter method (`fget`) on both class
        and instance attribute access.
        )__dict__getr4   r2   r   r   )r7   r9   r:   values       r"   __get__zlazyproperty.__get__   sY     ;K   ,,= JJsOOE',CL$Br$   r>   c                      t          d          )a  Raises unconditionally, to preserve read-only behavior.

        This decorator is intended to implement immutable (and idempotent) object
        attributes. For that reason, assignment to this property must be explicitly
        prevented.

        If this __set__ method was not present, this descriptor would become a
        *non-data descriptor*. That would be nice because the cached value would be
        accessed directly once set (__dict__ attrs have precedence over non-data
        descriptors on instance attribute lookup). The problem is, there would be
        nothing to stop assignment to the cached value, which would overwrite the result
        of `fget()` and break both the immutability and idempotence guarantees of this
        decorator.

        The performance with this __set__() method in place was roughly 0.4 usec per
        access when measured on a 2.8GHz development machine; so quite snappy and
        probably not a rich target for optimization efforts.
        zcan't set attribute)AttributeError)r7   r9   r>   s      r"   __set__zlazyproperty.__set__   s    & 2333r$   N)
r3   
__module____qualname____doc__r   r   r8   r   r?   rB    r$   r"   r/   r/   H   s        ( (T-Xc2g. -4 - - - - 3 c R    :43 4s 4t 4 4 4 4 4 4r$   r/   datafilenamec                     t          |d          5 }|                    d | D                        d d d            d S # 1 swxY w Y   d S )Nzw+c              3   D   K   | ]}t          j        |          d z   V  dS )r&   N)jsondumps).0datums     r"   	<genexpr>z save_as_jsonl.<locals>.<genexpr>   s1      JJEtz%0047JJJJJJr$   )open
writelines)rH   rI   output_files      r"   save_as_jsonlrT      s    	h		 KJJTJJJJJJK K K K K K K K K K K K K K K K K Ks    >AAc                 l    t          |           5 }d |D             cd d d            S # 1 swxY w Y   d S )Nc                 6    g | ]}t          j        |          S rG   )rL   loads)rN   lines     r"   
<listcomp>z#read_from_jsonl.<locals>.<listcomp>   s"    888T
4  888r$   )rQ   )rI   
input_files     r"   read_from_jsonlr[      s    	h 9:88Z8889 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9s   )--dependenciesextrasc                      t           t                    r g dt          t          t          f         dt          t          t          f         f fd}|S )Nfuncr   c                 t     t                     dt          j        dt          j        f fd            }|S )Nargskwargsc                      g }D ]&}t          |          s|                    |           't          |          dk    rHt          dd                    |           drd dndd                    |           d	z              | i |S )
Nr   z$Following dependencies are missing: z, . z5Please install them using `pip install "unstructured[z]"`.z'Please install them using `pip install  z`.)dependency_existsappendlenImportErrorr   )ra   rb   missing_depsdepr\   r]   r_   s       r"   wrapperz9requires_dependencies.<locals>.decorator.<locals>.wrapper   s    &(L# - -(-- - '',,,<  1$$!V499\;R;RVVV "b`RX````asxxP\G]G]aaa	   4((((r$   )r   r   ra   rb   )r_   rl   r\   r]   s   ` r"   	decoratorz(requires_dependencies.<locals>.decorator   sU    	t	)27 	)bi 	) 	) 	) 	) 	) 	) 	) 
	)  r$   )
isinstancer,   r   r   r   )r\   r]   rm   s   `` r"   requires_dependenciesro      sj     ,$$ &$~R( Xb"f-=       ( r$   
dependencyc                     	 t          j        |            n.# t          $ r!}| t          |          v rY d }~dS Y d }~nd }~ww xY wdS )NFT)	importlibimport_moduleri   repr)rp   es     r"   rf   rf      sp    
++++   a  55555 !     4s    
A=Adatec                     | st          d          t          D ]*}	 t          j        | |            dS # t           $ r Y 'w xY wt          d|  d          )NzThe argument date is None.TzThe argument zo does not satisfy the format: YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SStz)
ValueErrorDATE_FORMATSr   strptime)rv   formats     r"   validate_date_argsr|      s     75666  	dF+++44 	 	 	D	 	\ 	\ 	\ 	\  s   4
A Aitc                     t          |           }	 t          |          }n# t          $ r t          d          w xY w||fS )NzYExpected at least 1 element in iterable from which to retrieve first, got empty iterable.)iternextStopIterationrx   )r}   iteratorouts      r"   _first_and_remaining_iteratorr      s]    BxxH
8nn 
 
 

 
 	


 =s   ! ;c                 *    t          |           \  }}|S )zRReturns the first item from an iterable. Raises an error if the iterable is empty.)r   )r}   r   _s      r"   firstr     s    *2..FCJr$   c                 z    t          |           \  }}t          d |D                       rt          d          |S )zkReturns the only element from a singleton iterable. Raises an error if the iterable is not a
    singleton.c              3      K   | ]}d V  dS )TNrG   rN   r   s     r"   rP   zonly.<locals>.<genexpr>  s"      
"
"A4
"
"
"
"
"
"r$   zRExpected only 1 element in passed argument, instead there are at least 2 elements.)r   anyrx   )r}   r   r   s      r"   onlyr     sP     2"55MC

"
"
"
"
""" 
`
 
 	
 Jr$   c                     	 t          j        d           d} n# t          $ r d} Y nw xY wd                    t	          j                                        d          d d                   }	 t          j        d          dk    rt          j        d          dk    rd	t          v rht          j        d
t          z   dz   t	          j                    z   dz   |z   dz   t	          j                    z   dz   t          |           z   dz              d S t          j        d
t          z   dz   t	          j                    z   dz   |z   dz   t	          j                    z   dz   t          |           z   dz              d S d S d S # t          $ r Y d S w xY w)Nz
nvidia-smiTF.   SCARF_NO_ANALYTICStrueDO_NOT_TRACKdevz:https://packages.unstructured.io/python-telemetry?version=z
&platform=z&pythonz&arch=z&gpu=z	&dev=truez
&dev=false)
subprocesscheck_output	Exceptionr   platformpython_versionr)   osgetenvr   requestsr=   systemmachiner,   )gpu_presentr   s     r"   scarf_analyticsr     s   ---    XXh577==cBB2A2FGGN9)**f44>9R9RV\9\9\##P!""# o''(  	 
 %%  &(()  +&&	' "
"     P!""# o''(  	 
 %%  &(()  +&&	' #
#    ! 549\9\:    s&    ((-BE: A&E: :
FFr+   nc                     g }t          t          |           |z
  dz             D ]V}g }t          |          D ] }|                    | ||z                       !|                    t          |                     W|S )z Generate n-grams from a string s   )rangerh   rg   tuple)r+   r   ngrams_listingramjs         r"   ngramsr   A  s     K3q66A:>"" ) )q 	# 	#ALL1q5""""5<<((((r$   first_stringsecond_stringc                 (   |sdi fS t          |                                 |          }t          |                                |          }|sdS t          |          t          |          z  }t          |          t          |          z  dz  }||fS )zCalculate the percentage of common_ngrams between string_A and string_B
    with reference to the total number of ngrams in string_Ar   d   )r   r)   setrh   )r   r   r   first_string_ngramssecond_string_ngramscommon_ngrams
percentages          r"   !calculate_shared_ngram_percentager   M  s      "u !3!3!5!5q99!-"5"5"7"7;; q+,,s3G/H/HHMm$$s+>'?'??3FJ}$$r$   c                    g }t          |                                           t          |                                          k     r%t          |                                           dz
  }n(t          |                                          dz
  }|| }} d}|s"t          | ||          \  }}|dk    rn|dz  }|"t          |d          |t	          |dz             fS )ziIteratively calculate_shared_ngram_percentage starting from the biggest
    ngram possible until is >0.0%r   r   r   )rh   r)   r   roundr,   )r   r   shared_ngramsr   ngram_percentages        r"   "calculate_largest_ngram_percentager   b  s     M
<  3}':':'<'<#=#===""$$%%)##%%&&*&3\m 	*K+
 +
'-
 66FA  	 !1%%}c!a%jj@@r$           parent_targetchild_targetaddc                 d   t          |           dk    rdS |rbt          |           dk    rOt          |           } | dxx         |z  cc<   | dxx         |z  cc<   | dxx         |z  cc<   | dxx         |z  cc<   t          |          dk    rJ|d         | d         k    r8|d         | d         k    r&|d         | d         k    r|d         | d         k    rdS t          |          dk    rF| d         |d         cxk    r| d         k    r'n n$| d         |d         cxk    r| d         k    rn ndS dS )zTrue if the child_target bounding box is nested in the parent_target.
    Box format: [x_bottom_left, y_bottom_left, x_top_right, y_top_right].
    The parameter 'add' is the pixel error tolerance for extra pixels outside the parent region   Fr   r   r      T)rh   list)r   r   r   s      r"   is_parent_boxr   z  s    =Qu
  s=!!Q&&]++aCaCaCaC 	LQ!_a 000\!_VWHX5X5X!_a 000\!_VWHX5X5Xt
<AaLO????}Q/??????!QCCCC=3CCCCCCt5r$   totalbox1box2intersection_ratio_methodc                 T   | d         \  }}| d         \  }}|d         \  }}|d         \  }	}
||z
  ||z
  z  }|	|z
  |
|z
  z  }t          ||          }t          ||          }t          ||	          }t          ||
          }t          d||z
            t          d||z
            z  }t          ||          }t          ||          }||z   }|dk    r|dk    rdS ||z  dz  }n0|dk    r|dk    rdS ||z  dz  }n||z   dk    rdS |||z   |z
  z  dz  }t          |d          |||fS )af  Box format: [x_bottom_left, y_bottom_left, x_top_right, y_top_right].
    Calculates the percentage of overlapped region with reference to
    the biggest element-region (intersection_ratio_method="parent"),
    the smallest element-region (intersection_ratio_method="partial"), or to
    the disjunctive union region (intersection_ratio_method="total")
    r   r   parentr   partial)maxminr   )r   r   r   x1y1x2y2x3y3x4y4	area_box1	area_box2x_intersection1y_intersection1x_intersection2y_intersection2intersection_areamax_areamin_area
total_areaoverlap_percentages                         r"   calculate_overlap_percentager     s    !WFB!WFB!WFB!WFBbR"W%IbR"W%I"bkkO"bkkO"bkkO"bkkOA@AAC	/)E E  9i((H9i((HY&J H,,q==1/(:cA	"i	/	/q==1/(:cA 	!a''1/9y3HK\3\]add#Q''8ZGGr$         $@box_pair
label_pair	text_pairix_pairsm_overlap_thresholdc                    d\  }}}}| \  }	}
|\  }}|\  }}|\  }}t          |	|
d          \  }}}}||k     r| d| d| d| dg}d}n|s| d| d| d| dg}d| }n|s| d| d| d| dg}d| }n||v s||v r| d| d| d| dg}d}nt          ||          \  }}}t          |d	          }|s| d| d| d| dg}d
}nl| d| d| d| dg}t          |                                          t          |                                          k     r|n|}d|z   d| dz   }d| d| }|||||||fS )ax  Classifies the overlapping case for an element_pair input.
    There are 5 categories of overlapping:
    'Small partial overlap', 'Partial overlap with empty content',
    'Partial overlap with duplicate text (sharing 100% of the text)',
    'Partial overlap without sharing text', and
    'Partial overlap sharing {calculate_largest_ngram_percentage(...)}% of the text'
    )NNNNr   r   (ix=)zSmall partial overlapz&partial overlap with empty content in z#partial overlap with duplicate textr   z$partial overlap without sharing textzof the text from(z-gram)zpartial overlap sharing z% )r   r   r   rh   r)   )r   r   r   r   r   overlapping_elementsoverlapping_caser   largest_ngram_percentager   r   type1type2text1text2ix_element1ix_element2r   r   r   largest_shared_ngrams_max	largest_nref_types                          r"   identify_overlapping_caser     s   \X*,>@X JD$LE5LE5&K;W"+< < <88Z
 000((+(((((+((( 
 3  *	e,,k,,,,,k,,,$   POO #	e,,k,,,,,k,,,$   POOe^^u~~,,k,,,,,k,,,$   E 35%@@	()',-Eq'I'I$+ e00+00000+000($ $J   00+00000+000($ %($6$6U[[]]9K9K$K$K55QV-8;Py;P;P;PP#d>V#d#dZb#d#d   r$      nested_error_tolerance_pxc                 &   | \  }}|\  }}d                     d |D                       }	d                     d |D                       }
|dd                                         }|dd                                         }|d         \  }}|d         \  }}|d         \  }}|d         \  }}||||g}||||g}||k     o||k    }||k     o||k    }d\  }}}}}d	\  }}}|r|rt          ||d
          \  }}}}t          ||d          \  }}}}t          |||          r| d|	 d| d|
 dg}d| d| }d}nKt          |||          r| d|
 d| d|	 dg}d| d| }d}nt	          | |||	|
f|          \  }}}}}}}||||||||fS )zIdentify if there are nested or overlapping elements. If overlapping is present,
    it identifies the case calling the method identify_overlapping_caser   c                 :    g | ]}|                                 |S rG   	isnumericrN   chs     r"   rY   z8identify_overlapping_or_nesting_case.<locals>.<listcomp>2  %    @@@"@2@@@r$   c                 :    g | ]}|                                 |S rG   r   r   s     r"   rY   z8identify_overlapping_or_nesting_case.<locals>.<listcomp>3  r   r$   r   Nr   r   )NNNNN)NNNr   r   r   )r   r   r   znested z in r   )r   )r   r*   r   r   r   ) r   r   r   r   r   r   r   r   r   r   r   x_bottom_left_1y_bottom_left_1x_top_right_1y_top_right_1x_bottom_left_2y_bottom_left_2x_top_right_2y_top_right_2box1_cornersbox2_cornershorizontal_overlapvertical_overlapr   r   r   overlap_percentage_totalr   r   r   r   r   s                                    r"   $identify_overlapping_or_nesting_caser  '  s    JD$LE5''@@@@@AAK''@@@@@AAK!""IOOE!""IOOE'+Aw$O_#'7 M='+Aw$O_#'7 M=#_m]SL#_m]SL(=8\]_=\&6Z=?;Z	   &6"Hh
 +. +,H&--
 -
 -
) !Q
 >Z&.>
 >
 >
:Hh
 |9RSSS 	,,k,,,,,k,,,$   <;;E;;!$<;TUUU 	,,k,,,,,k,,,$   <;;E;;!$ *k*%9  $ "( 	  	 	r$   elementsr   c                 <   | d         j         j        }d t          |          D             }d t          |          D             }d t          |          D             }t          |           D ]\  }}|j         j        dz
  }	||	                             |j         j                                        d                    ||	                             | d|j                    ||	                             |j                   d}
g }t          t          |||          d	          D ]\  }\  }}}t          t          |d
                    }t          t          |d
                    }t          t          |d
                    }t          |||          D ]\  }}}t          |||||          \  }}}}}}}}|r[|                    ||| d|| dt          |d
           dt          |d
           dt          |d
           ddd           d}
|
|fS )zLCatch overlapping and nested bounding boxes cases across a list of elements.c                     g | ]}g S rG   rG   r   s     r"   rY   z7catch_overlapping_and_nested_bboxes.<locals>.<listcomp>  s    333Qb333r$   c                     g | ]}g S rG   rG   r   s     r"   rY   z7catch_overlapping_and_nested_bboxes.<locals>.<listcomp>  s    000!2000r$   c                     g | ]}g S rG   rG   r   s     r"   rY   z7catch_overlapping_and_nested_bboxes.<locals>.<listcomp>  s    1111B111r$   r   pointsrd   F)startr   %u   pxˆ2)r   r  r   r   r   )r   r   r   metadataT)r  page_numberr   	enumeraterg   coordinatesto_dictcategorytextzipr   r   r  r   )r  r   r   	num_pagesbounding_boxestext_labelstext_contentixelementn_page_to_ixdocument_with_overlapping_flagoverlapping_casesr  page_bboxespage_labels	page_textpage_bboxes_combinationspage_labels_combinationstext_content_combinationsr   r   r   r   r   r   r  r   r   r   r   s                                 r"   #catch_overlapping_and_nested_bboxesr/    s    %1I33%	"2"2333N00uY//000K11i 0 0111L ** 8 8G'3a7|$++G,<,H,P,P,R,RS[,\]]]L!((B)D)D'2B)D)DEEE\"))',7777%*">GNK66? ? ? -6 -6::k;	 $([!(D(D#E#E #'[!(D(D#E#E $(i)C)C$D$D!/2$$%0
 0
 %	6 %	6+Hj) 5)$ 	$ "((   6!((0D,<1C.F.F.F8P;S8V8V8V+01+=+=(D(D(D+01+=+=(D(D(D-2:q-A-A*H*H*H% %	    26.K%	6N *+<<<r$   rC   )r   )r   )r   )r   r   );r5   r'   rr   rL   r   r   r   r   r   	itertoolsr   typingr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   unstructured.__version__r   unstructured.documents.elementsr   ry   r   r   r,   r-   r/   rT   r[   ro   rf   r|   r   r   r   r   intr   floatr   r   boolr   r   r   r  r/  rG   r$   r"   <module>r8     sx             				                  " " " " " "                               "  ' ' ' ' ' ' 0 0 0 0 0 0 5444444\WT]]Yt__K(3-)@ KS K K K K<j4 j4 j4 j4 j472; j4 j4 j4ZKT
 Kc Kd K K K K
9c 9d4j 9 9 9 9 ! T#Y'SM xB (2r6"223   <#     Xc]    "	h 	5h3G 	 	 	 	h 3    X #    ( ( (V	c 	c 	d 	 	 	 	%%% % T]	% % % %*AS A ARWY]_bQc A A A A6  u%e$ 
 
	   H &--H -H
e
-H
e
-H  #-H -H -H -Hj #'X XDtU{+,eE$+4F.GGHXd3is+,X T#Yc
*+X 49eCj()	X
  X X X X@ &'"&\ \DtU{+,eE$+4F.GGH\d3is+,\ T#Yc
*+\  #	\
  \ \ \ \B &'"&D= D=6lD="D=  D= DJ	D= D= D= D= D= D=r$   