
    j 	                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ	 d dl
mZ  ej        ddd          Z ej        d          Ze                    e                    d                      ej        dd	g
          Z ej        dd          Z e            Zd Zd Zd Zd Zd Zd Zd Zd Z ee          Z d Z!e"dk    rd dl#Z#eeeeeeefZ$e$D ]Z% e#j#                    Z& e'd          D ]Z( e!ee%          Z) e#j#                    e&z
  Z* e+              e+e%j"                    e+d,                    e)                      e+d,                    e*dz  dz                       dS dS )    N)	Tokenizer)GOLDEN_EN_RULESenF)languageclean	char_spansentencizeren_core_web_smner)disabletokenize)lang
processorsc                 P    t          j        |                               d          S )N
)	blingfiretext_to_sentencessplittexts    h/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/benchmarks/benchmark_sbd_tools.pyblingfire_tokenizer      s!    &t,,224888    c                 *    t          j        |           S N)nltksent_tokenizer   s    r   nltk_tokenizer      s    d###r   c                 N    t                               |           }d |D             S )Nc                 6    g | ]}|                                 S  )strip).0ss     r   
<listcomp>z"pysbd_tokenize.<locals>.<listcomp>   s     (((!AGGII(((r   )pysbd_segmentersegment)r   segmentss     r   pysbd_tokenizer)      s)    &&t,,H((x((((r   c                 >    d t          |           j        D             S )Nc                     g | ]	}|j         
S r!   r   r#   sents     r   r%   z"spacy_tokenize.<locals>.<listcomp>!   s    222$DI222r   )nlpsentsr   s    r   spacy_tokenizer0       s    22#d))/2222r   c                 >    d t          |           j        D             S )Nc                     g | ]	}|j         
S r!   r   r,   s     r   r%   z&spacy_dep_tokenize.<locals>.<listcomp>$   s    666$DI666r   )nlp_depr/   r   s    r   spacy_dep_tokenizer4   #   s    66'$--"56666r   c                 >    d t          |           j        D             S )Nc                     g | ]	}|j         
S r!   r   )r#   es     r   r%   z#stanza_tokenize.<locals>.<listcomp>'   s    777qAF777r   )
stanza_nlp	sentencesr   s    r   stanza_tokenizer:   &   s!    77Jt,,67777r   c              #   z   K   | D ]5}d                     d |D                                                       V  6d S )N c              3   4   K   | ]}t          |          V  d S r   )str)r#   tokens     r   	<genexpr>z!make_sentences.<locals>.<genexpr>+   s(      77Uc%jj777777r   )joinr"   )segmented_tokenssentences     r   make_sentencesrD   )   sY      $ @ @gg77h77777==??????@ @r   c                     t                               |           }t          j        t          |                    }d t	          |          D             }|S )Nc                     g | ]}|S r!   r!   r,   s     r   r%   z#syntok_tokenize.<locals>.<listcomp>0   s    888888r   )syntok_tokenizerr   syntok_segmenteriterrD   )r   tokensresultr(   s       r   syntok_tokenizerL   -   sK    ##D))F#DLL11F88!7!7888HOr   c                 d    d}| D ]}|\  }} ||          }||k    r|dz  }|t           z  dz  }|S )Nr      g      Y@)total_rules)golden_rulestokenize_funcscoreruler   expectedr(   percent_scores           r   	benchmarkrV   6   sX    E  h =&&xQJE[(E1Mr   __main__d   zGRS score: {:0.2f}%z&Speed(Avg over 100 runs): {:>10.2f} msi  )-r   r   pysbdspacystanzasyntok.tokenizerr   syntok.segmenter	segmenterrH   english_golden_rulesr   	Segmenterr&   blankr.   add_pipecreate_pipeloadr3   Pipeliner8   rG   r   r   r)   r0   r4   r:   rD   rL   lenrO   rV   __name__time	librariesrQ   trangeirU   
time_takenprintformatr!   r   r   <module>rp      s           & & & & & & + + + + + + 0 0 0 0 0 0!%/4uNNNek$ S__]++ , , ,
%*%w
7
7
7V_$:>>>
9;; 9 9 9$ $ $) ) )3 3 37 7 78 8 8@ @ @   c/""	 	 	 zKKKI # 	T 	TDIKKs 	F 	FA%Io}EEMMTY[[1_
m$%%%#**=99:::6==joc>QRRSSSS' 	T 	Tr   