
    Xj                       d dl mZ d dlZd dlmZ  ee          Zd d ddddddd	d
dddZdddddZ ej	        d          Z
 ej	        d          Z ej	        d          Z ej	        d          Zd-dZd.dZd.dZd/d0d%Zd1d+Zd2d,ZdS )3    )annotationsN)
get_logger                        	   )u   零u   〇u   一u   二u   两u   三u   四u   五u   六u   七u   八u   九
   d   i  '  )u   十u   百u   千u   万u3   [零〇一二两三四五六七八九十百千万]+uF   [\s（）()\[\]【】《》〈〉<>〔〕第号,，。；;:：_\-—]+z^\[([^\]]+)\]\([^)]+\)$z\s+tokenstrreturnintc                ^   t          d | D                       s,t          d                    d | D                                 S d}d}d}| D ]X}|t          v rt          |         }t                              |          }|6|dk    r||z   |z  }||z  }d}n
||pd|z  z  }d}Y||z   |z   S )Nc              3  (   K   | ]}|t           v V  d S )N)	_CN_UNITS.0chars     _/lsinfo/ai/hellotax_ai/data_center/backend/app/services/tax_data_processor/relation_identity.py	<genexpr>z(_parse_chinese_number.<locals>.<genexpr>   s'      33Tty 333333     c              3  f   K   | ],}t          t                              |d                     V  -dS )r   N)r   
_CN_DIGITSgetr   s     r   r   z(_parse_chinese_number.<locals>.<genexpr>   s8      JJD3z~~dA6677JJJJJJr   r   r   r   )anyr   joinr    r   r!   )r   resultsectionnumberr   units         r   _parse_chinese_numberr(      s    33U33333 L277JJEJJJJJKKKFGF  :%F}}T""<5=='4/GgFGG!t++GGf$$r   value
str | Nonec                x    | sdS t                               d |           }t                              d|          S )Nr   c                `    t          t          |                     d                              S )Nr   )r   r(   group)matchs    r   <lambda>z,normalize_legal_doc_number.<locals>.<lambda>)   s"    4I%++VW..4Y4Y0Z0Z r   )_CN_NUMBER_REsub_DROP_CHARS_RE)r)   	converteds     r   normalize_legal_doc_numberr4   &   s?     r!!"Z"Z\abbIb),,,r   c                   | sdS |                                  }t                              |          }|r'|                    d                                           }|                     d          }t                              d|          S )Nr   r   u   《》〈〉[]()（） 	
)strip_MARKDOWN_LINK_REr.   r-   _WHITESPACE_REr1   )r)   titlelinks      r   normalize_relation_titler;   ,   sz     rKKMME""5))D &

1##%%KK788Eb%(((r   doc_id
int | None
doc_numberr9   
source_urlc                h   | rd|  S |rd|                                  S |pd                                 }|r+|                    d          st          |          }|rd| S |p|                    d          r|nd }t          |          }|rd| S t                              d||           dS )	Nzid:zurl:r   )u   《u   〈znum:ztitle:z(failed to generate relation identity key)r>   r9   )r6   
startswithr4   r;   loggerwarning)r<   r>   r9   r?   
raw_numbernormalized_numbertitle_valuenormalized_titles           r   relation_identity_keyrH   6   s     V~~ +*j&&((***"))++J .:00@@ .6zBB 	.-+---Z**?*?*O*OYJJUYK/<< +*(***
NN=*\aNbbb2r   external_numberexternal_titleinternal_doc_numberinternal_titleboolc                   t          |          }t          |          }|r
|r||k    rdS t          |           r|rt          |           |k    rdS t          |           }t          |          }|r|sdS ||k    p||v p||v S )NTF)r;   r4   )rI   rJ   rK   rL   	ext_title	int_title
ext_number
int_numbers           r   is_same_legal_referencerS   G   s    (88I(88I Y I$:$:t00 Y D\]lDmDmqzDzDzt+O<<J+,?@@J Z u#[zZ'?[:Q[C[[r   c           
     F   | r|sdS t          | dd           s$t          | dd           st          | dd           sdS t          t          | dd           t          | dd           t          |dd           t          |dd                     sdS d | _        d | _        d | _        dS )NFsuperseded_by_doc_numbersuperseded_by_titlesuperseded_by_source_urlr>   r9   T)getattrrS   rU   rV   rW   )doc
superseders     r   &clear_duplicate_external_superseded_byr[   T   s    j uC3T:: gcK`bf>g>g krsv  yS  UY  lZ  lZ u"730JD#Q#QSZ[^`uw{S|S|  F  GQ  S_  ae  f  f  ho  pz  |C  EI  hJ  hJ  K  K u#'C "C#'C 4r   )r   r   r   r   )r)   r*   r   r   )NNNN)
r<   r=   r>   r*   r9   r*   r?   r*   r   r   )
rI   r*   rJ   r*   rK   r*   rL   r*   r   rM   )r   rM   )
__future__r   recommon_loggingr   __name__rB   r    r   compiler0   r2   r7   r8   r(   r4   r;   rH   rS   r[    r   r   <module>rb      sV   " " " " " " 				 % % % % % %	H		q1QqQR[\efopyz  DE  F  F
s4>>	
PQQhiiBJ=>> F##% % % %,- - - -) ) ) )    "\ \ \ \
 
 
 
 
 
r   