
    j4              %       <   d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlmZmZ d d	l m!Z!m"Z" d d
l#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 ddddddddgdd ddddfdee2         deee3                  dee2         dee2         deee2ge2f                  dee2         de4deee2                  dee5         dee5         dee2         dee2         de4dee2         d ed!ee         f d"Z6 e             eej7                   e            ddddddddgdd ddddfdee2         deee3                  dee2         dee2         deee2ge2f                  dee2         de4deee2                  dee5         dee5         dee2         dee2         de4dee2         d ed!ee         f d#                                    Z8de2d!e4fd$Z9	 	 d4de2d%ee	e	e:e:f         d&f                  d'ee         d!efd(Z;	 	 d5d)ee2         dee5         dee5         d!ee2         fd*Z<	 	 d6d+e2dee5         dee5         d!ee2         fd,Z=d-e2d.e5d!ee2         fd/Z>	 d7d-e2dee5         d!ee2         fd0Z?	 d8d-e2d2e2d!ee2         fd3Z@dS )9    N)IOAnyCallableListOptionalTuple)add_chunking_strategy)auto_paragraph_grouperclean_bullets)CoordinateSystem)	AddressElementElementMetadataEmailAddressListItemNarrativeTextTextTitleprocess_metadata)read_txt_file)FileTypeadd_metadata_with_filetype)PARAGRAPH_PATTERNUNICODE_BULLETS_RE)sent_tokenize)exactly_oneget_last_modified_date get_last_modified_date_from_file)apply_lang_metadata)is_bulleted_textis_email_addressis_possible_narrative_textis_possible_numbered_listis_possible_titleis_us_city_state_zipTauto  Ftextfilenamefileencodingparagraph_groupermetadata_filenameinclude_metadata	languagesmax_partitionmin_partitionmetadata_last_modifiedchunking_strategydetect_language_per_elementdetection_originkwargsreturnc                 8    t          d| |||||||||	|
|||d|S )a  Partitions an .txt documents into its constituent paragraph elements.
    If paragraphs are below "min_partition" or above "max_partition" boundaries,
    they are combined or split.
    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    text
        The string representation of the .txt document.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    paragrapher_grouper
        A str -> str function for fixing paragraphs that are interrupted by line breaks
        for formatting purposes.
    include_metadata
        Determines whether or not metadata is included in the output.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    max_partition
        The maximum number of characters to include in a partition. If None is passed,
        no maximum is applied.
    min_partition
        The minimum number of characters to include in a partition.
    metadata_last_modified
        The day of the last modification
    )r)   r*   r(   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5    )_partition_text)r)   r*   r(   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   s                  e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/text.pypartition_textr<   +   sU    d  ++)##5+$?)        c                    ||                                 dk    r|s| sg S |	#|!|	|k    s|	dk     s|dk     rt          d          t          | ||           d}d}| $t          | |          \  }}t	          |           }n7|$t          ||          \  }}t          |          }n|t          |          }|du rn| ||          }nt          |          }|	"t          |          |	k     rt          d	          t          ||	|
          }g }|rt          |p| |
p||          }||_        nt                      }|D ]d}|                                 }|rLt          |          s=t          |          }t          j        |          |_        |                    |           et%          t'          |||                    }|S )z!internal API for `partition_text`N r   z6Invalid values for min_partition and/or max_partition.)r)   r*   r(   )r)   r+   )r*   r+   FzB`min_partition` cannot be larger than the length of file contents.)r1   r0   )r)   last_modifiedr/   )elementsr/   r4   )strip
ValueErrorr   r   r   r   strr
   len_split_by_paragraphr   r5   is_empty_bulletelement_from_textcopydeepcopymetadataappendlistr   )r)   r*   r(   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   	file_textlast_modification_datefile_contentrA   rK   ctextelements                         r;   r:   r:   p   sQ   * DJJLLB..t.H.	 	!%]**ma.?.?=STCTCTQRRR 48888I!+XQQQ)!7!A!A		+III)!A$!G!G		II	E!!		&%%i00		*955	 S^^m%C%C]^^^&##  L !H %"&2(0J4J
 
 

 %5!!"$$ % % 	%// 	%'..G#}X66GOOG$$$(C	
 	
 	
 H Or=   c                 P    t          j        |           ot          |           dk    S )z(Checks if input text is an empty bullet.   )r   matchrE   r(   s    r;   rG   rG      s"    #D))<c$ii1n<r=   coordinates.coordinate_systemc                    t          |           r!t          |           }t          |||          S t          |           rt	          |           S t          |           rt          | ||          S t          |           rt          | ||          S t          |           rt          | ||          S t          |           rt          | ||          S t          | ||          S )N)r(   rW   rX   rV   )r    r   r   r!   r   r%   r   r#   r"   r   r$   r   r   )r(   rW   rX   
clean_texts       r;   rH   rH      sL   
  &
"4((
#/
 
 
 	

 
$		 
&&&&	d	#	# 
#/
 
 
 	

 
#4	(	( 
#/
 
 
 	

 
$D	)	) 
#/
 
 
 	

 
4	 	  
#/
 
 
 	
 #/
 
 
 	
r=   split_paragraphsc                    |pd}t          d                    |                     }|p|}g }g }t          |           D ]\  }}||v r
t          |          |k    r|                    |           3|}t          | |dz   d                   D ]O\  }	}
t          |          t          |
          z   dz   |k    r$|                    ||	z   dz              |d|
z   z  }O |                    |           |S )zQCombine paragraphs less than `min_partition` while not exceeding `max_partition`.r    rT   N)rE   join	enumeraterL   )r[   r0   r1   max_possible_partitioncombined_parascombined_idxsiparacombined_paraj	next_paras              r;   !_combine_paragraphs_less_than_minrh      s1    "&QM *:!;!;<<!;%;M "N!M-.. 1 14 t99%%!!$'''' M )*:1q577*C D D  9 }%%I6:mKK!((Q333!S9_4MM!!-0000r=   rN   c                     t          j        t          |                                           }g }|D ]&}|                    t          ||                     't          |||          }|S )zQSplit text into paragraphs that fit within the `min_` and `max_partition` window.)contentr0   )r[   r0   r1   )resplitr   rB   extend_split_content_to_fit_maxrh   )rN   r1   r0   
paragraphsr[   	paragraphcombined_paragraphss          r;   rF   rF   "  s     +Y__->->??J"$ 
 
	%!+  	
 	
 	
 	
 <)##   r=   rj   nc                     g }t          |           |dz  k     rt          t          |                     }nt          j        | |          }|S )zaSplits a section of content into chunks that are at most
    size n without breaking apart words.   )width)rE   rM   _split_in_half_at_breakpointtextwrapwrap)rj   rr   segmentss      r;   _split_content_size_nrz   <  sO     H
7||a!e4W==>>=222Or=   c                    t          |           }g }d}|D ]}|ct          |          |k    rP|r|                    |           d}t          ||          }|                    |dd                    |d         }g|1t          |dz   |z             |k    r|                    |           |}|s|}|d|z   z  }|                                }|r|                    |           |S )zkSplits a paragraph or section of content so that all of the elements fit into the
    max partition window.r?   N)rr   r]   )r   rE   rL   rz   rm   rB   )rj   r0   	sentenceschunks	tmp_chunksentencery   s          r;   rn   rn   G  s#    g&&IFI 2 2 $X)F)F i(((	,XGGGHMM(3B3-((( II
 (SS81K-L-L}-\-\i((($		 ! 2 (IIx/I ) 1 1II !i   Mr=   r]   
breakpointc                 4   t          |           dz  }t          t          |           dz            D ]/}| ||z            |k    r||z  } n| ||z
           |k    r|| z  } n0| d|                                         | |d                                         gS )zCSplits a segment of content at the breakpoint closest to the middlert   N)rE   rangerstriplstrip)rj   r   midrc   s       r;   rv   rv   p  s    
 g,,!
C3w<<1$%%  37z))1HCES1W++A2ICE , DSDM  ""GCDDM$8$8$:$:;;r=   )NN)r'   r   )r   r'   )r'   )r]   )ArI   rk   rw   typingr   r   r   r   r   r   unstructured.chunkingr	   unstructured.cleaners.corer
   r   "unstructured.documents.coordinatesr   unstructured.documents.elementsr   r   r   r   r   r   r   r   r    unstructured.file_utils.encodingr    unstructured.file_utils.filetyper   r   unstructured.nlp.patternsr   r   unstructured.nlp.tokenizer   unstructured.partition.commonr   r   r   unstructured.partition.langr    unstructured.partition.text_typer    r!   r"   r#   r$   r%   rD   bytesboolintr<   TXTr:   rG   floatrH   rh   rF   rz   rn   rv   r9   r=   r;   <module>r      s    				  ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; 7 7 7 7 7 7        @ ? ? ? ? ?
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ; : : : : : Q Q Q Q Q Q Q Q K K K K K K K K 3 3 3 3 3 3         
 < ; ; ; ; ;                # $"8<'+!&,X#'#$,0'+(-&,B BsmB
2e9
B 3-B sm	B
  # 45B  }B B S	"B C=B C=B %SMB  }B "&B smB B  
']!B B B BJ HL))" $"8<'+!&,X#'#$,0'+(-&,V VsmV
2e9
V 3-V sm	V
  # 45V  }V V S	"V C=V C=V %SMV  }V "&V smV V  
']!V V V  *) Vr=# =$ = = = = >B48+
 +

+
%eUl 3S 89:+
   01+
 	+
 +
 +
 +
` $(#$   3i C=  C=  
#Y	       J $%#' C= C= 
#Y	   43 3 49     $(& &&C=& 
#Y& & & &V < <<< 
#Y< < < < < <r=   