
    jk+              
          d dl Z d dlmZmZmZmZ d dlZd dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ g dZdgfd	eee                  fd
Zd	eee                  dee         fdZdefdZdedefdZdefdZdedefdZdgfded	eee                  deee                  fdZ	 ddee         d	eee                  dedee         fdZdS )    N)IterableIteratorListOptional)DetectorFactorydetect_langslang_detect_exception)Element)logger)TESSERACT_LANGUAGES_SPLITTER)~aframharaasmazeaze_cyrlbelbenbodbosbrebulcatcebceschi_simchi_sim_vertchi_trachi_tra_vertchrcoscymdandeudivdzoellengenmepoequesteusfaofasfilfinfrafrkfrmfryglagleglggrcgujhathebhinhrvhunhyeikuindislitaita_oldjavjpnjpn_vertkankatkat_oldkazkhmkirkmrkorkor_vertlaolatlavlitltzmalmarmkdmltmonmrimsamyanepnldnorocioriosdpanpolporpusqueronrussansinslkslvsndsnumspaspa_oldsqisrpsrp_latnsunswaswesyrtamtatteltgkthatirtonturuigukrurduzbuzb_cyrlvieyidyorr(   	languagesc                 D   | t          d          t          t          d d | D                                 }t          t                              |                    }t          |          dk    rt          j        d|  d           dS t          j	        |          S )	zf
    Entry point: convert languages (list of strings) into tesseract ocr langcode format (uses +)
    Nz`languages` can not be `None`c                     | d uo| dk    S )N  )xs    e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/lang.py<lambda>z1prepare_languages_for_tesseract.<locals>.<lambda>   s    atm/R     c                 ,    g | ]}t          |          S r   )convert_language_to_tesseract.0langs     r   
<listcomp>z3prepare_languages_for_tesseract.<locals>.<listcomp>   s!    GGGT*400GGGr   r   z@Failed to find any valid standard language code from languages: z, proceed with `eng` instead.r(   )

ValueErrorlistfilterdictfromkeyslenr   warningr   join)r   converted_languagess     r   prepare_languages_for_tesseractr      s     8999//GGYGGG	
 	
  t}}-@AABB
1$$C#C C C	
 	
 	
 u',-@AAAr   ocr_languagesc                     | dg} t          | t                    st          d          |9| dgk    rt          d          t	          |          } t          j        d           | S )zHandle `ocr_languages` and `languages`, defining `languages` to ['eng'] as default and
    converting `ocr_languages` if neededNr(   zOThe language parameter must be a list of language codes as strings, ex. ['eng']z}Only one of languages and ocr_languages should be specified. languages is preferred. ocr_languages is marked for deprecation.zmThe ocr_languages kwarg will be deprecated in a future version of unstructured. Please use languages instead.)
isinstancer   	TypeErrorr   &convert_old_ocr_languages_to_languagesr   r   )r   r   s     r   check_languagesr      s     G	i&& 
]
 
 	
  S   ?}MMIN0   r   c                 6    |                      t                    S )z
    Convert ocr_languages parameter to list of langcode strings.
    Assumption: ocr_languages is in tesseract plus sign format
    )splitr   )r   s    r   r   r      s     ;<<<r   r   returnc                 R   | t           v r| S 	 t          j                            |                                           }n-# t          j        $ r t          j        |  d           Y dS w xY wd t           D             }|j        |v r(t          |j                  }t          j        |          S |j        |v r(t          |j                  }t          j        |          S |j        |v r(t          |j                  }t          j        |          S t          j        |  d           dS )zf
    Convert a language code to its tesseract formatted and recognized langcode(s), if supported.
    z' is not a valid standard language code.r   c                 "    h | ]}|d d         S )N   r   r   s     r   	<setcomp>z0convert_language_to_tesseract.<locals>.<setcomp>   s     BBB48BBBr   z* is not a language supported by Tesseract.)PYTESSERACT_LANGSiso639LanguagematchlowerLanguageNotFoundErrorr   r   part3(_get_all_tesseract_langcodes_with_prefixr   r   part2bpart2t)r   lang_iso639pytesseract_langs_3matched_langcodess       r   r   r      sK       o++DJJLL99'   $GGGHHHrr CB0ABBB ///D[EVWW+01BCCC 
	2	2	2D[EWXX+01BCCC 
	2	2	2D[EWXX+01BCCC 	$JJJKKKrs   1? &A)(A)prefixc                 *      fdt           D             S )zb
    Get all matching tesseract langcodes with this prefix (may be one or multiple variants).
    c                 >    g | ]}|                               |S r   )
startswith)r   langcoder   s     r   r   z<_get_all_tesseract_langcodes_with_prefix.<locals>.<listcomp>   s,    VVV(:M:Mf:U:UVHVVVr   )r   )r   s   `r   r   r      s!     WVVV%6VVVVr   c                     t           j                            | dd                                                   }|j        S )zP
    Convert a language code to the standard internal language code format.
    Nr   )r   r   r   r   r   )r   r   s     r   _convert_to_standard_langcoder      s4    
 /''RaR(8(899Kr   autotextc                    t          |t                    st          d          |d         dk    s|                                 dk    rdS t	          j        d|           r(t          |                                           dk     rdgS dt          _	        |rd|vrd	 |D             }nt          |          d
k    rt          j        d| d           	 t          |           }n1# t          j        $ r}t          j        |           Y d}~dS d}~ww xY wd |D             }g }|D ]}||vr|                    |           |S )z
    Detects the list of languages present in the text (in the default "auto" mode),
    or formats and passes through the user inputted document languages if provided.
    zOThe language parameter must be a list of language codes as strings, ex. ["eng"]r   r   Nz^[\x00-\x7F]+$   r(   r   c                 ,    g | ]}t          |          S r   )r   r   s     r   r   z$detect_languages.<locals>.<listcomp>%  s!    SSS6t<<SSSr      z9Since "auto" is present in the input languages provided (z]), the language will be auto detected and the rest of the inputted languages will be ignored.c                     g | ]?}|j                             d           rt          d           nt          |j                   @S )zh)r   r   r   )r   langobjs     r   r   z$detect_languages.<locals>.<listcomp>;  sZ     
 
 
  |&&t,,=)$///.w|<<
 
 
r   )r   r   r   striprer   r   r   r   seedr   r   r   r	   LangDetectExceptionappend)r   r   doc_languageslangdetect_resultelangdetect_langsr   s          r   detect_languagesr     s    i&& 
]
 
 	
 |rTZZ\\R//t 
x!4(( S->->-B-Bw O
  "+V9,,SSSSS
 y>>AN-I - - -  	 ,T 2 2$8 	 	 	N144444	
 
 -	
 
 
 $ 	+ 	+D=(($$T***s   C# #D2DDFelementsdetect_language_per_elementc              #     K   |dg}|dgk    r
| E d{V  dS t          | t                    st          |           } d                    d | D                       }t	          ||          }|.t          |          dk    r|du r| D ]}||j        _        |V  dS | D ]9}t          |d	          r#t	          |j	                  |j        _        |V  5|V  :dS )
zBDetect and apply metadata.languages to each element in `elements`.Nr   r    c              3   D   K   | ]}t          |d           |j        V  dS )r   N)hasattrr   )r   r   s     r   	<genexpr>z&apply_lang_metadata.<locals>.<genexpr>d  s3      HHAWQ5G5GHHHHHHHr   )r   r   r   Fr   )
r   r   r   r   r   r   metadatar   r   r   )r   r   r   	full_textdetected_languagesr   s         r   apply_lang_metadatar   K  s:      H	 RDh%% ">>HHHHHHHI)yINNN&	NNa'500  	 	A#5AJ GGGG	 	  	 	Aq&!! '7'?'?
$	 	r   )F)r   typingr   r   r   r   r   
langdetectr   r   r	   unstructured.documents.elementsr
   unstructured.loggerr   &unstructured.partition.utils.constantsr   r   strr   r   r   r   r   r   r   boolr   r   r   r   <module>r      s@   				 5 5 5 5 5 5 5 5 5 5 5 5  K K K K K K K K K K 3 3 3 3 3 3 & & & & & & O O O O O O   D GLW B BxS	/B B B B B0xS	2 8C=    6=# = = = =& & & & & &RWS W W W W      '-XD D
DS	"D d3iD D D DT ).* *w*S	"* "&* g	* * * * * *r   