o
    Ղi                     @  s   d Z ddlmZ ddlZddddddddd	d
dddZdddddZedZedZedZ	edZ
d3ddZd4dd Zd4d!d"Z				d5d6d(d)Zd7d/d0Zd8d1d2ZdS )9zEHelpers for identifying duplicate legal-document relation references.    )annotationsN                        	   )u   零u   〇u   一u   二u   两u   三u   四u   五u   六u   七u   八u   九
   d   i  '  )u   十u   百u   千u   万u3   [零〇一二两三四五六七八九十百千万]+uF   [\s（）()\[\]【】《》〈〉<>〔〕第号,，。；;:：_\-—]+z^\[([^\]]+)\]\([^)]+\)$z\s+tokenstrreturnintc                 C  s   t dd | D stddd | D S d}d}d}| D ]0}|tv r(t| }qt|}|d u r2q|dkrC|| | }||7 }d}n||pGd| 7 }d}q|| | S )Nc                 s  s    | ]}|t v V  qd S )N)	_CN_UNITS.0char r   _/lsinfo/ai/hellotax_ai/data_center/backend/app/services/tax_data_processor/relation_identity.py	<genexpr>   s    z(_parse_chinese_number.<locals>.<genexpr> c                 s  s     | ]}t t|d V  qdS )r   N)r   
_CN_DIGITSgetr   r   r   r   r      s    r   r   r   )anyr   joinr   r   r   )r   resultsectionnumberr   unitr   r   r   _parse_chinese_number   s&   
r#   value
str | Nonec                 C  s$   | sdS t dd | }td|S )zENormalize common legal doc-number variants for same-reference checks.r   c                 S  s   t t| dS )Nr   )r   r#   group)matchr   r   r   <lambda>:   s    z,normalize_legal_doc_number.<locals>.<lambda>)_CN_NUMBER_REsub_DROP_CHARS_RE)r$   	convertedr   r   r   normalize_legal_doc_number5   s   r-   c                 C  sB   | sdS |   }t|}|r|d  }| d}td|S )zFNormalize a legal document title for exact same-title relation checks.r   r   u   《》〈〉[]()（） 	
)strip_MARKDOWN_LINK_REr'   r&   _WHITESPACE_REr*   )r$   titlelinkr   r   r   normalize_relation_title>   s   

r3   doc_id
int | None
doc_numberr1   
source_urlc                 C  s   | rd|  S |rd|   S |pd  }|r(|ds(t|}|r(d| S |p2|dr1|nd}t|}|r>d| S dS )z6Return a stable key for de-duplicating relation items.zid:zurl:r   )u   《u   〈znum:Nztitle:)r.   
startswithr-   r3   )r4   r6   r1   r7   
raw_numbernormalized_numbertitle_valuenormalized_titler   r   r   relation_identity_keyK   s   


r=   external_numberexternal_titleinternal_doc_numberinternal_titleboolc                 C  st   t |}t |}|r|r||krdS t | r |r t | |kr dS t| }t|}|r,|s.dS ||kp9||v p9||v S )zVReturn True when an external superseded-by field duplicates an internal document link.TF)r3   r-   )r>   r?   r@   rA   	ext_title	int_title
ext_number
int_numberr   r   r   is_same_legal_referenced   s   rG   c                 C  s|   | r|sdS t | ddst | ddst | ddsdS tt | ddt | ddt |ddt |dds3dS d| _d| _d| _dS )	zUClear external superseded-by fields when they duplicate the linked internal document.Fsuperseded_by_doc_numberNsuperseded_by_titlesuperseded_by_source_urlr6   r1   T)getattrrG   rH   rI   rJ   )doc
supersederr   r   r   &clear_duplicate_external_superseded_byy   s(   






rN   )r   r   r   r   )r$   r%   r   r   )NNNN)
r4   r5   r6   r%   r1   r%   r7   r%   r   r   )
r>   r%   r?   r%   r@   r%   rA   r%   r   rB   )r   rB   )__doc__
__future__r   rer   r   compiler)   r+   r/   r0   r#   r-   r3   r=   rG   rN   r   r   r   r   <module>   s>    






	
