
    jf                        d Z ddlmZ ddlmZmZmZmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZ ddlmZ 	 	 	 	 dddZ G d de	          ZdS )zdImplementation of chunking by title.

Main entry point is the `@add_chunking_strategy()` decorator.
    )annotations)IteratorListOptionalTuple)BasePreChunkerBoundaryPredicateChunkingOptionsPreChunkCombineris_in_next_sectionis_on_next_pageis_title)Element)lazypropertyTN  elementsList[Element]multipage_sectionsboolcombine_text_under_n_charsOptional[int]new_after_n_charsmax_charactersintreturnc                    t          j        ||||          }t          t                              | |          |                                          }d |D             S )a  Uses title elements to identify sections within the document for chunking.

    Splits off into a new CompositeElement when a title is detected or if metadata changes, which
    happens when page numbers or sections change. Cuts off sections once they have exceeded a
    character length of max_characters.

    Parameters
    ----------
    elements
        A list of unstructured elements. Usually the output of a partition function.
    multipage_sections
        If True, sections can span multiple pages. Defaults to True.
    combine_text_under_n_chars
        Combines elements (for example a series of titles) until a section reaches a length of
        n characters. Defaults to `max_characters` which combines chunks whenever space allows.
        Specifying 0 for this argument suppresses combining of small chunks. Note this value is
        "capped" at the `new_after_n_chars` value since a value higher than that would not change
        this parameter's effect.
    new_after_n_chars
        Cuts off new sections once they reach a length of n characters (soft max). Defaults to
        `max_characters` when not specified, which effectively disables any soft window.
        Specifying 0 for this argument causes each element to appear in a chunk by itself (although
        an element with text longer than `max_characters` will be still be split into two or more
        chunks).
    max_characters
        Chunks elements text and text_as_html (if present) into chunks of length
        n characters (hard max)
    )r   r   r   r   )optsc                @    g | ]}|                                 D ]}|S  )iter_chunks).0	pre_chunkchunks      e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/chunking/title.py
<listcomp>z"chunk_by_title.<locals>.<listcomp>E   s1    SSSi9;P;P;R;RSS%ESSSS    )r
   newr   _ByTitlePreChunkeriter_pre_chunksiter_combined_pre_chunks)r   r   r   r   r   r   
pre_chunkss          r$   chunk_by_titler,      sy    F #=%-+	  D "**8T::      TS:SSSSr&   c                  *    e Zd ZdZedd            ZdS )r(   zPre-chunker for the "by_title" chunking strategy.

    The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates a
    new "section", hence the "by-title" designation.
    r   Tuple[BoundaryPredicate, ...]c                >     d fd}t           |                      S )zBThe semantic-boundary detectors to be applied to break pre-chunks.r   Iterator[BoundaryPredicate]c               3  z   K   t           V  t                      V   j        j        st	                      V  d S d S )N)r   r   _optsr   r   )selfs   r$   iter_boundary_predicateszI_ByTitlePreChunker._boundary_predicates.<locals>.iter_boundary_predicatesS   sQ      NNN$&&&&&:0 (%'''''''( (r&   )r   r0   )tuple)r3   r4   s   ` r$   _boundary_predicatesz'_ByTitlePreChunker._boundary_predicatesO   s;    	( 	( 	( 	( 	( 	( --//000r&   N)r   r.   )__name__
__module____qualname____doc__r   r6   r   r&   r$   r(   r(   H   s>          	1 	1 	1 \	1 	1 	1r&   r(   )TNNr   )r   r   r   r   r   r   r   r   r   r   r   r   )r:   
__future__r   typingr   r   r   r   unstructured.chunking.baser   r	   r
   r   r   r   r   unstructured.documents.elementsr   unstructured.utilsr   r,   r(   r   r&   r$   <module>r@      s(   
 # " " " " " 2 2 2 2 2 2 2 2 2 2 2 2                  4 3 3 3 3 3 + + + + + +
  $04'+.T .T .T .T .Tb1 1 1 1 1 1 1 1 1 1r&   