
    #j2                       d Z ddlmZ ddlZddlZddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZ ddlmZmZ dd	lmZmZ erdd
lmZmZmZmZ ddlmZ 	 ddlZdZn# e $ r dZY nw xY w	 ddl!m"Z" dZ#n# e $ r dZ#Y nw xY w ej$        e%          Z& edd          Z' G d dee          Z( G d de(          Z) G d de*e          Z+ e	d           G d d                      Z,d dZ-dS )!zText splitter base interface.    )annotationsN)ABCabstractmethod)	dataclass)Enum)TYPE_CHECKINGAnyLiteralTypeVar)BaseDocumentTransformerDocument)Selfoverride)Callable
CollectionIterableSequence)SetTF)PreTrainedTokenizerBaseTSTextSplitter)boundc                      e Zd ZdZddedddfd8dZed9d            Z	 d:d;dZd<dZ	d=d#Z
d>d&Zed?d+            Zed,d e            d-fd@d5            ZedAd7            ZdS )Br   z)Interface for splitting text into chunks.i     FT
chunk_sizeintchunk_overlaplength_functionCallable[[str], int]keep_separatorbool | Literal['start', 'end']add_start_indexboolstrip_whitespacereturnNonec                    |dk    rd| }t          |          |dk     rd| }t          |          ||k    rd| d| d}t          |          || _        || _        || _        || _        || _        || _        dS )aL  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                in each corresponding chunk `(True='start')`
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                every document
        r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)selfr   r   r   r    r"   r$   msgs           g/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/langchain_text_splitters/base.py__init__zTextSplitter.__init__/   s    * ??===CS//!1D]DDCS//!:%%6} 6 66 6 6  S//!%+ /- /!1    textstr	list[str]c                    dS )z$Split text into multiple components.N )r/   r4   s     r1   
split_textzTextSplitter.split_textW   s      r3   Ntexts	metadataslist[dict[Any, Any]] | Nonelist[Document]c           	        |pi gt          |          z  }g }t          |          D ]\  }}d}d}|                     |          D ]}	t          j        ||                   }
| j        rE||z   | j        z
  }|                    |	t          d|                    }||
d<   t          |	          }t          |	|
          }|
                    |           |S )z9Create a list of `Document` objects from a list of texts.r   start_index)page_contentmetadata)len	enumerater9   copydeepcopyr-   r*   findmaxr   append)r/   r:   r;   
metadatas_	documentsir4   indexprevious_chunk_lenchunkrA   offsetnew_docs                r1   create_documentszTextSplitter.create_documents[   s     32$U"3
	 '' 	* 	*GAtE!".. * *=A77( 4"%77$:MMF IIeSF^^<<E.3H]+),U&"III  ))))* r3   rJ   Iterable[Document]c                    g g }}|D ]6}|                     |j                   |                     |j                   7|                     ||          S )zSplit documents.)r;   )rH   r@   rA   rQ   )r/   rJ   r:   r;   docs        r1   split_documentszTextSplitter.split_documentso   sa    ry 	+ 	+CLL)***S\****$$Ui$@@@r3   docs	separator
str | Nonec                j    |                     |          }| j        r|                                }|pd S N)joinr.   strip)r/   rV   rW   r4   s       r1   
_join_docszTextSplitter._join_docsw   s5    ~~d##! 	 ::<<D|tr3   splitsIterable[str]c                   |                      |          }g }g }d}|D ]}|                      |          }||z   t          |          dk    r|ndz   | j        k    r|| j        k    r!t                              d|| j                   t          |          dk    r|                     ||          }	|	|                    |	           || j        k    s,||z   t          |          dk    r|ndz   | j        k    r}|dk    rw||                      |d                   t          |          dk    r|ndz   z  }|dd          }|| j        k    K||z   t          |          dk    r|ndz   | j        k    r|dk    w|                    |           ||t          |          dk    r|ndz   z  }|                     ||          }	|	|                    |	           |S )Nr   zACreated a chunk of size %d, which is longer than the specified %d   )r+   rB   r)   loggerwarningr]   rH   r*   )
r/   r^   rW   separator_lenrV   current_doctotaldlen_rT   s
             r1   _merge_splitszTextSplitter._merge_splits}   s>    --i88!# 	K 	KA((++D[1A1AA1E1E1M"# # 4+++NN'(	   {##a''//+yAACC(((  $"555[9I9IA9M9MSTU*+ +!AII!6!6{1~!F!F-0-=-=-A-AMMq"  '2!""o  $"555[9I9IA9M9MSTU*+ +!AII q!!!Tc+.>.>.B.B]]JJEEook955?KKr3   	tokenizerr   kwargsr	   c                    t           sd}t          |          t          t                    sd}t          |          d	fd} | d
d|i|S )z?Text splitter that uses Hugging Face tokenizer to count length.z`Could not import transformers python package. Please install it with `pip install transformers`.zATokenizer received was not an instance of PreTrainedTokenizerBaser4   r5   r%   r   c                H    t                              |                     S rZ   )rB   tokenizer4   rj   s    r1   _huggingface_tokenizer_lengthzNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    y))$//000r3   r   r4   r5   r%   r   r8   )_HAS_TRANSFORMERSr(   
isinstancer   )clsrj   rk   r0   rp   s    `   r1   from_huggingface_tokenizerz'TextSplitter.from_huggingface_tokenizer   s    
 ! 	"E  S//!)%<== 	"UCS//!	1 	1 	1 	1 	1 	1 sKK#@KFKKKr3   gpt2allencoding_name
model_nameallowed_special!Literal['all'] | AbstractSet[str]disallowed_special Literal['all'] | Collection[str]r   c                   	 t           sd}t          |          |t          j        |          	nt          j        |          	d
	fd}t          | t                    r||d}i ||} | dd	|i|S )z;Text splitter that uses `tiktoken` encoder to count length.zCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.Nr4   r5   r%   r   c                N    t                              |                     S N)rz   r|   )rB   encode)r4   rz   r|   encs    r1   _tiktoken_encoderz=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s4    

$3'9     r3   )rx   ry   rz   r|   r   rq   r8   )_HAS_TIKTOKENImportErrortiktokenencoding_for_modelget_encoding
issubclassTokenTextSplitter)
rt   rx   ry   rz   r|   rk   r0   r   extra_kwargsr   s
      ``    @r1   from_tiktoken_encoderz"TextSplitter.from_tiktoken_encoder   s      	#A 
 c"""!-j99CC'66C	 	 	 	 	 	 	 	 c,-- 	0!.(#2&8	 L 0/,/Fs??#4????r3   Sequence[Document]c                F    |                      t          |                    S )z2Transform sequence of documents by splitting them.)rU   list)r/   rJ   rk   s      r1   transform_documentsz TextSplitter.transform_documents   s    
 ##DOO444r3   )r   r   r   r   r   r   r    r!   r"   r#   r$   r#   r%   r&   r4   r5   r%   r6   rZ   )r:   r6   r;   r<   r%   r=   )rJ   rR   r%   r=   )rV   r6   rW   r5   r%   rX   )r^   r_   rW   r5   r%   r6   )rj   r   rk   r	   r%   r   )rx   r5   ry   rX   rz   r{   r|   r}   rk   r	   r%   r   )rJ   r   rk   r	   r%   r   )__name__
__module____qualname____doc__rB   r2   r   r9   rQ   rU   r]   ri   classmethodru   setr   r   r   r8   r3   r1   r   r   ,   sL       33  039> %!%&2 &2 &2 &2 &2P 3 3 3 ^3 JN    (A A A A   * * * *X L L L [L(  $!%=@SUU?D(@ (@ (@ (@ [(@T 5 5 5 X5 5 5r3   c                  F     e Zd ZdZdd e            dfd fdZddZ xZS )r   z/Splitting text to tokens using model tokenizer.rv   Nrw   rx   r5   ry   rX   rz   r{   r|   r}   rk   r	   r%   r&   c                     t                      j        di | t          sd}t          |          |t	          j        |          }nt	          j        |          }|| _        || _        || _	        dS )zCreate a new TextSplitter.zCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.Nr8   )
superr2   r   r   r   r   r   
_tokenizer_allowed_special_disallowed_special)	r/   rx   ry   rz   r|   rk   r0   r   	__class__s	           r1   r2   zTokenTextSplitter.__init__   s     	""6""" 	#A 
 c"""!-j99CC'66C /#5   r3   r4   r6   c                     d fd}t           j         j         j        j        |          }t          ||          S )	ar  Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text: The input text to be split into smaller chunks.

        Returns:
            A list of text chunks, where each chunk is derived from a portion
                of the input text based on the tokenization and chunking rules.
        _textr5   r%   	list[int]c                R    j                             | j        j                  S r   )r   r   r   r   )r   r/   s    r1   _encodez-TokenTextSplitter.split_text.<locals>._encode  s1    ?)) $ 5#'#; *   r3   )r   tokens_per_chunkdecoder   ro   )r   r5   r%   r   )	Tokenizerr*   r)   r   r   split_text_on_tokens)r/   r4   r   rj   s   `   r1   r9   zTokenTextSplitter.split_text  sc     	 	 	 	 	 	 -!-?)	
 
 
	 $CCCCr3   )rx   r5   ry   rX   rz   r{   r|   r}   rk   r	   r%   r&   r   )r   r   r   r   r   r2   r9   __classcell__)r   s   @r1   r   r      sx        99 $!%=@SUU?D6 6 6 6 6 6 64D D D D D D D Dr3   r   c                      e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellvisualbasic6N) r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLVISUALBASIC6r8   r3   r1   r   r   /  s        ,,
C	BDF	B	B
CEFA
CDDEEHED
CFEA
CDGFJ!LLLr3   r   )frozenc                  B    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   d	S )
r   zTokenizer data class.r   r   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]r   N)r   r   r   r   __annotations__r8   r3   r1   r   r   P  sQ         *,&&&&=&&&&==r3   r   r4   r5   rj   r%   r6   c                   g }|                     |           }d}|j        |j        k    rd}t          |          |t	          |          k     rt          ||j        z   t	          |                    }|||         }|sne|                    |          }|r|                    |           |t	          |          k    rn%||j        |j        z
  z  }|t	          |          k     |S )z6Split incoming text and return chunks using tokenizer.r   z3tokens_per_chunk must be greater than chunk_overlap)r   r   r   r(   rB   minr   rH   )	r4   rj   r^   	input_ids	start_idxr0   cur_idx	chunk_idsdecodeds	            r1   r   r   ^  s    F  &&II!Y%<<<Coo
c)nn
$
$i)"<<c)nnMMi/0	 	""9-- 	#MM'"""c)nn$$Y/)2III	 c)nn
$
$ Mr3   )r4   r5   rj   r   r%   r6   ).r   
__future__r   rD   loggingabcr   r   dataclassesr   enumr   typingr   r	   r
   r   langchain_core.documentsr   r   typing_extensionsr   r   collections.abcr   r   r   r   r   AbstractSetr   r   r   $transformers.tokenization_utils_baser   rr   	getLoggerr   rb   r   r   r   r5   r   r   r   r8   r3   r1   <module>r      s   # # " " " " " "   # # # # # # # # ! ! ! ! ! !                  G F F F F F F F , , , , , , , , 3HHHHHHHHHHHH222222OOOMM   MMMLLLLLL    
	8	$	$WT(((B5 B5 B5 B5 B5*C B5 B5 B5J;D ;D ;D ;D ;D ;D ;D ;D|" " " " "sD " " "B $
> 
> 
> 
> 
> 
> 
> 
>     s$   A A'&A'+A4 4A>=A>