
    j                     Z    d dl Z d dlmZ d dlmZ d dlmZ d dlmZ  G d de	          Z
dS )    N)Language)	Processor)Cleaner)TextSpanc                   .    e Zd Zd	dZd Zd Zd Zd ZdS )
	SegmenterenFNc                     || _         t          j        |          | _        || _        || _        || _        | j        r| j        rt          d          | j        dk    r| j        st          d          dS dS )a  Segments a text into an list of sentences
        with or withour character offsets from original text

        Parameters
        ----------
        language : str, required
            specify a language use its two character ISO 639-1 code,
            by default "en"
        clean : bool, optional
            cleans original text, by default False
        doc_type : [type], optional
            Normal text or OCRed text, by default None
            set to `pdf` for OCRed text
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False
        zWchar_span must be False if clean is True. Since `clean=True` will modify original text.pdfzl`doc_type='pdf'` should have `clean=True` & `char_span` should be False since originaltext will be modified.N)languager   get_language_codelanguage_modulecleandoc_type	char_span
ValueError)selfr   r   r   r   s        Y/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/pysbd/segmenter.py__init__zSegmenter.__init__   s    $ !'9(CC
 ": 	6$. 	6 M N N N ]e##DJ# 5 6 6 6 $###    c                     t          | j        d          r'| j                            || j        | j                  S t          || j        | j                  S )Nr   )r   )hasattrr   r   r   r   texts     r   cleanerzSegmenter.cleaner,   sc    4'33 	O'//d6J9= 0 H H H 4!5NNNNr   c                     t          | j        d          r'| j                            || j        | j                  S t          || j        | j                  S )Nr   )r   )r   r   r   r   r   s     r   	processorzSegmenter.processor3   sk    4'55 	7'11$8L<@N 2 L L L T4#7'+~7 7 7 7r   c           	      N   g }d}|D ]}t          j        d                    t          j        |                    | j                  D ][}|                                }|                                \  }}||k    r(|                    t          |||                     |} n\|S )Nr   z{0}\s*)	refinditerformatescapeoriginal_textgroupspanappendr   )	r   	sentences
sent_spansprior_end_char_idxsentmatch	match_strmatch_start_idxmatch_end_idxs	            r   sentences_with_char_spansz#Segmenter.sentences_with_char_spans;   s     
 	 	DX__RYt__%E%EtGYZZ 
 
!KKMM	16. #555 %% O]KKM M M)6&E 6 r   c                 >   || _         |sg S | j        s| j        dk    r'|                     |                                          }|                     |                                          }|                     |          }| j        r|S | j        r|S d |D             S )Nr   c                     g | ]	}|j         
S  )r*   ).0textspans     r   
<listcomp>z%Segmenter.segment.<locals>.<listcomp>`   s    HHHhHMHHHr   )r#   r   r   r   r   processr/   r   )r   r   postprocessed_sentssentence_w_char_spanss       r   segmentzSegmenter.segmentO   s    ! 	I: 	.%//<<%%++--D"nnT22::<< $ > >?R S S> 	I((Z 	I&& IH2GHHHHr   )r	   FNF)__name__
__module____qualname__r   r   r   r/   r9   r2   r   r   r   r   	   sm        6 6 6 6BO O O7 7 7  (I I I I Ir   r   )r   pysbd.languagesr   pysbd.processorr   pysbd.cleanerr   pysbd.utilsr   objectr   r2   r   r   <module>rB      s    				 $ $ $ $ $ $ % % % % % % ! ! ! ! ! !            WI WI WI WI WI WI WI WI WI WIr   