
    jw                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ	  ej
        ddd          Z ej        d          Ze                    e                    d                      ej        ddg	          Z ej        dd
          Z e            Zd Zd Zd Zd Zd Zd Zd Zd Zd Zedk    rd dlZeeeeeeefZ e D ]Z! ej                    Z" e#d          5 Z$e$%                                Z&ddd           n# 1 swxY w Y    ee&e!          Z' ej                    e"z
  Z( e)              e)e!j                    e)d*                    e(dz                       dS dS )    N)	TokenizerenF)languageclean	char_spansentencizeren_core_web_smner)disabletokenize)lang
processorsc                 P    t          j        |                               d          S )N
)	blingfiretext_to_sentencessplittexts    l/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/benchmarks/bigtext_speed_benchmark.pyblingfire_tokenizer      s!    &t,,224888    c                 *    t          j        |           S N)nltksent_tokenizer   s    r   nltk_tokenizer      s    d###r   c                 R    t                               |           }d |D             }|S )Nc                 6    g | ]}|                                 S  )strip).0ss     r   
<listcomp>z"pysbd_tokenize.<locals>.<listcomp>   s     ,,,a		,,,r   )pysbd_segmentersegment)r   segmentss     r   pysbd_tokenizer(      s-    &&t,,H,,8,,,HOr   c                 >    d t          |           j        D             S )Nc                 B    g | ]}|j                             d           S r   r   r!   r"   sents     r   r$   z"spacy_tokenize.<locals>.<listcomp>    s&    >>>dDIOOD!!>>>r   )nlpsentsr   s    r   spacy_tokenizer1      s    >>c$iio>>>>r   c                 >    d t          |           j        D             S )Nc                 B    g | ]}|j                             d           S r+   r,   r-   s     r   r$   z&spacy_dep_tokenize.<locals>.<listcomp>#   s&    BBBdDIOOD!!BBBr   )nlp_depr0   r   s    r   spacy_dep_tokenizer5   "   s    BBgdmm.ABBBBr   c                 >    d t          |           j        D             S )Nc                     g | ]	}|j         
S r    r   )r"   es     r   r$   z#stanza_tokenize.<locals>.<listcomp>&   s    777qAF777r   )
stanza_nlp	sentencesr   s    r   stanza_tokenizer;   %   s!    77Jt,,67777r   c              #   z   K   | D ]5}d                     d |D                                                       V  6d S )N c              3   4   K   | ]}t          |          V  d S r   )str)r"   tokens     r   	<genexpr>z!make_sentences.<locals>.<genexpr>*   s(      77Uc%jj777777r   )joinr!   )segmented_tokenssentences     r   make_sentencesrE   (   sY      $ @ @gg77h77777==??????@ @r   c                     t                               |           }t          j        t          |                    }d t	          |          D             }|S )Nc                     g | ]}|S r    r    r-   s     r   r$   z#syntok_tokenize.<locals>.<listcomp>/   s    888888r   )syntok_tokenizerr   syntok_segmenteriterrE   )r   tokensresultr'   s       r   syntok_tokenizerM   ,   sK    ##D))F#DLL11F88!7!7888HOr   c                      ||           }|S r   r    )big_texttokenize_funcr'   s      r   speed_benchmarkrQ   2   s    }X&&HOr   __main__zbenchmarks/1661-0.txtzSpeed : {:>20.2f} msi  )+r   r   pysbdspacystanzasyntok.tokenizerr   syntok.segmenter	segmenterrI   	Segmenterr%   blankr/   add_pipecreate_pipeloadr4   Pipeliner9   rH   r   r   r(   r1   r5   r;   rE   rM   rQ   __name__time	librariesrP   topenbigfilereadrO   r:   
time_takenprintformatr    r   r   <module>ri      s           & & & & & & + + + + + +!%/4uNNNek$ S__]++ , , ,
%*%w
7
7
7V_$:>>>
9;; 9 9 9$ $ $  
? ? ?C C C8 8 8@ @ @     zKKKI # 
@ 
@DIKKT)** 	&g||~~H	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	&#OHm<<	TY[[1_
m$%%%$++J,=>>????+ 
@ 
@s   'DD	D	