
    j                     f    d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
  G d de          ZdS )	    N)Text)ListItemReplacer)ExclamationWords)BetweenPunctuation)AbbreviationReplacerc                   v    e Zd ZddZd Zd Zd Zd Zd Zd Z	d	 Z
d
 Zd Zd Zd Zd Zd Zd Zd Zd ZdS )	ProcessorFc                 0    || _         || _        || _        dS )an  Process a text - do pre and post processing - to get proper sentences

        Parameters
        ----------
        text : str
            Original text
        language : object
            Language module
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False
        N)textlang	char_span)selfr   r   r   s       Y/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/pysbd/processor.py__init__zProcessor.__init__   s     		"    c                 &   | j         s| j         S | j                             dd          | _         t          | j                   }|                                | _         |                                  |                                  |                                  |                                  t          | j                   	                    | j
        j        j        | j
        j        | j
        j                  | _         |                                 }|S )N
)r   replacer   add_line_breakreplace_abbreviationsreplace_numbersreplace_continuous_punctuation)replace_periods_before_numeric_referencesr   applyr   AbbreviationWithMultiplePeriodsAndEmailRuleGeoLocationRuleFileFormatRulesplit_into_segments)r   lipostprocessed_sentss      r   processzProcessor.process   s    y 	9I%%dD11	di((%%''	""$$$++---66888OO))I"BI%ty'?A A	 #6688""r   c                    t          t          d|                    }t          d |D                       s|S g }|D ]G}t          |t                     r|D ]}|                    |           2|                    |           H|S )zRemove None values and unpack list of list sents

        Parameters
        ----------
        sents : list
            list of sentences

        Returns
        -------
        list
            unpacked and None removed list of sents
        Nc              3   @   K   | ]}t          |t                    V  d S N)
isinstancelist).0ss     r   	<genexpr>z,Processor.rm_none_flatten.<locals>.<genexpr>:   s,      661:a&&666666r   )r(   filteranyr'   append)r   sents	new_sentssentr*   s        r   rm_none_flattenzProcessor.rm_none_flatten,   s     VD%(())6666666 	L	 	' 	'D$%% ' ( (A$$Q''''(   &&&&r   c                 H                                        j                            d          }                     |          } fd|D             } fd|D             }                     |          }g }|D ]} t	          |          j         j        j        j         } 	                    |          }|r+t          |t                    r|                    |           jt          |t                    r|D ]}|                    |            fd|D             }|S )Nr   c                 x    g | ]6} t          |          j        j        j        gj        j        j        R  7S  )r   r   r   SingleNewLineRuleEllipsisRulesAllr)   r*   r   s     r   
<listcomp>z1Processor.split_into_segments.<locals>.<listcomp>J   sP     
 
 
 DGGM$)5T	8O8STTT
 
 
r   c                 :    g | ]}                     |          S r5   )check_for_punctuationr9   s     r   r:   z1Processor.split_into_segments.<locals>.<listcomp>N   s'    >>>1++A..>>>r   c                 h    g | ].}t          |                              j        j                  /S r5   )r   r   r   SubSingleQuoteRule)r)   nsr   s     r   r:   z1Processor.split_into_segments.<locals>.<listcomp>Z   s@     > > >#%  $Bxx~~di.JKK > > >r   )check_for_parens_between_quotesr   splitr2   r   r   r   SubSymbolsRulesr8   post_process_segmentsr'   strr.   r(   )r   r/   r"   r1   post_process_sentppss   `     r   r    zProcessor.split_into_segmentsE   sr   ,,...	%%$$U++
 
 
 

 
 
 ?>>>>>>$$U++  	4 	4D#4::#TY%>%BCD $ : :4 @ @  4Z0A3%G%G 4#**+<====-t44 4, 4 4C'..s3333> > > >)<> > >""r   c                    t          |          dk    rt          j        d|          r|S t          j        d|          r	  t	          |          j        | j        j        j         }t          j        | j        j	        |          r!t          j
        | j        j        |          }|S |                    dd          }|                                S )N   z\A[a-zA-Z]*\Zz\tr    )lenresearchmatchr   r   r   ReinsertEllipsisRulesr8   "QUOTATION_AT_END_OF_SENTENCE_REGEXrA   .SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEXr   stripr   txts     r   rC   zProcessor.post_process_segments^   s    s88a<<BI&6<<<J 8E3 	 d3iioty>BC9TYA3GG 	(	H#O OCJ++dB''C99;;r   c                 `    d }t          j        | j        j        || j                  | _        d S )Nc                     |                                  } t          j        dd|           }t          j        dd|          }|S )Nz\s(?=\()r   z	(?<=\))\s)grouprK   subrM   sub1sub2s      r   paren_replacez@Processor.check_for_parens_between_quotes.<locals>.paren_replacey   s:    KKMME6+tU33D6,d33DKr   )rK   rW   r   "PARENS_BETWEEN_DOUBLE_QUOTES_REGEXr   )r   r[   s     r   r@   z)Processor.check_for_parens_between_quotesx   s8    	 	 	
 F49G#TY0 0			r   c                 `    d }t          j        | j        j        || j                  | _        d S )Nc                     |                                  } t          j        t          j        d          d|           }t          j        t          j        d          d|          }|S )N!u   &ᓴ&?u   &ᓷ&)rV   rK   rW   escaperX   s      r   continuous_puncs_replacezJProcessor.replace_continuous_punctuation.<locals>.continuous_puncs_replace   sJ    KKMME6")C..'599D6")C..'488DKr   )rK   rW   r   CONTINUOUS_PUNCTUATION_REGEXr   )r   rb   s     r   r   z(Processor.replace_continuous_punctuation   s8    	 	 	
 F49A0$)= =			r   c                 Z    t          j        | j        j        d| j                  | _        d S )Nu	   ∯\2\r\7)rK   rW   r   NUMBERED_REFERENCE_REGEXr   r   s    r   r   z3Processor.replace_periods_before_numeric_references   s'    F49="DI/ /			r   c                 T    t          j        dd|          }t          |          dk    S )Nz_{3,}rI   r   )rK   rW   rJ   rR   s     r   consecutive_underscorez Processor.consecutive_underscore   s%    fXr3''3xx1}r   c                     t          fd| j        j        D                       r|                               }|S gS )Nc              3       K   | ]}|v V  	d S r&   r5   )r)   prS   s     r   r+   z2Processor.check_for_punctuation.<locals>.<genexpr>   s'      88AqCx888888r   )r-   r   Punctuationsprocess_text)r   rS   r/   s    ` r   r<   zProcessor.check_for_punctuation   sN    8888!788888 	%%c**EL 5Lr   c                    |d         | j         j        vr|dz  }t          j        |          }|                     |          }t          j        | j         j        j        |          s& t          |          j
        | j         j        j         } t          |          j
        | j         j        g| j         j        j        R  }t          |                                          }|                     |          }|S )Nu   ȸ)r   rl   r   apply_rulesbetween_punctuationrK   rM   DoublePunctuationRulesDoublePunctuationr   r   r8   QuestionMarkInQuotationRuleExclamationPointRulesr   replace_parenssentence_boundary_punctuationrR   s     r   rm   zProcessor.process_text   s    r7$)0004KC*3//&&s++x	8JCPP 	I!$s))/49#C#GHCd3iiodiC D#y>BD D Ds##22440055
r   c                 f     t          | j                  j        | j        j        j         | _        d S r&   )r   r   r   r   Numbersr8   rf   s    r   r   zProcessor.replace_numbers   s'    )DOO)49+<+@A			r   c                     t          | j        d          r%| j                            | j        | j                  S t          | j        | j                  S )Nr   )hasattrr   r   r   rf   s    r   abbreviations_replacerz Processor.abbreviations_replacer   sG    49455 	>911$)TYGGG'	49===r   c                 \    |                                                                  | _        d S r&   )r|   r   r   rf   s    r   r   zProcessor.replace_abbreviations   s%    //1199;;			r   c                 ~    t          | j        d          r| j                            |          S t          |          S )Nr   )r{   r   r   rR   s     r   between_punctuation_processorz'Processor.between_punctuation_processor   s;    49233 	+9//444%c***r   c                 T    |                      |                                          }|S r&   )r   r   rR   s     r   rq   zProcessor.between_punctuation   s&    0055==??
r   c                    t          | j        d          r,t          |                              | j        j                  }t          | j        d          r,t          |                              | j        j                  }t          j        dd|          }d t          j        | j        j	        |          D             }|S )NReplaceColonBetweenNumbersRule#ReplaceNonSentenceBoundaryCommaRuleu   &ᓴ&$r_   c                 6    g | ]}|                                 S r5   )rV   )r)   ms     r   r:   z;Processor.sentence_boundary_punctuation.<locals>.<listcomp>   s-       AGGII  r   )
r{   r   r   r   r   r   rK   rW   finditerSENTENCE_BOUNDARY_REGEXrR   s     r   rw   z'Processor.sentence_boundary_punctuation   s    49>?? 	:s))//	8: :C49CDD 	?s))//	=? ?C fYS)) !{49+LcRR   
r   N)F)__name__
__module____qualname__r   r#   r2   r    rC   r@   r   r   rh   r<   rm   r   r|   r   r   rq   rw   r5   r   r   r	   r	   	   s       # # # #"# # #   2# # #2  40 0 0= = =/ / /
  
    B B B> > >< < <+ + +      r   r	   )rK   pysbd.utilsr   pysbd.lists_item_replacerr   pysbd.exclamation_wordsr   pysbd.between_punctuationr   pysbd.abbreviation_replacerr   objectr	   r5   r   r   <module>r      s    				       6 6 6 6 6 6 4 4 4 4 4 4 8 8 8 8 8 8 < < < < < <C C C C C C C C C Cr   