o
    Bi02                     @  s   d Z ddlmZ ddlZddlmZmZ ddlmZ ddl	m
Z
mZ ddlZeeZdZeded	ed
edgZedZedZdZG dd deeZeG dd dZG dd dZdddZe ZdS )z
Tax Document Reference Extractor
Pure rule-based extraction of inter-document relations from Chinese tax documents.
No LLM dependency.
    )annotationsN)	dataclassfield)Enum)ListOptionalu   [\[\]\[\]〔〕（()]uF   [\u4e00-\u9fa5]{2,6}(?:[\[\[〔（(][12]\d{3}[\]\]〕)][^号]{1,6}号)uJ   [\u4e00-\u9fa5]{2,8}第?(?:[\[\[〔（(][12]\d{3}[\]\]〕)][^号]{1,6}号)uS   (?:国家税务总局|财政部|海关总署)[^公]*公告[12]\d{3}年第\d{1,4}号u4   (?:国务院|财政部|税务总局)令第\d{1,4}号u   (?:[\u4e00-\u9fa5]{2,8}第?[\[\[〔（(][12]\d{3}[\]\]〕)][^号]{1,6}号|(?:国家税务总局|财政部|海关总署)[^公]*公告[12]\d{3}年第\d{1,4}号|(?:国务院|财政部|税务总局)令第\d{1,4}号)u]   第[一二三四五六七八九十百\d]+条(?:第[一二三四五六七八九十百\d]+款)?d   c                   @  s(   e Zd ZdZdZdZdZdZdZdZ	dS )	RelationType
REFERENCESANNULSAMENDS	HAS_ANNEX	ISSUED_BYSUPERSEDED_BYEFFECTIVE_FROMN)
__name__
__module____qualname__r
   r   r   r   r   r   r    r   r   R/lsinfo/ai/hellotax_ai/base_platform/app/services/knowledge/reference_extractor.pyr	   A   s    r	   c                   @  sR   e Zd ZU ded< ded< ded< ded< dZd	ed
< dZd	ed< dZd	ed< dS )DocumentRelationstrsource_doc_idr	   relation_typecontextfloat
confidenceNzOptional[str]target_doc_idtarget_doc_numberarticle_number)r   r   r   __annotations__r   r   r   r   r   r   r   r   K   s   
 r   c                   @  sP   e Zd ZdZ	ddd	d
ZdddZdddZdddZdddZdddZ	dS )TaxReferenceExtractorzV
    Pure rule-based extractor for inter-document relations in Chinese tax texts.
     textr   doc_id
doc_numberreturnList[DocumentRelation]c           	        s   g }| | || | | || | | || | | ||  r1 fdd|D }t }g }|D ]}|j|j|jf}||vrP|	| |
| q8td|t| |S )aH  
        Entry point.  Runs all sub-extractors and deduplicates.

        Args:
            text: Full document text.
            doc_id: Identifier of the source document.
            doc_number: Document number of the source (used to avoid self-refs).

        Returns:
            List of DocumentRelation instances.
        c                   s   g | ]	}|j  kr|qS r   )r   .0rr%   r   r   
<listcomp>x   s
    
z;TaxReferenceExtractor.extract_relations.<locals>.<listcomp>z5reference_extractor: doc_id=%s extracted %d relations)extend_extract_references_extract_annuls_extract_amendments_extract_annexessetr   r   r   addappendloggerdebuglen)	selfr#   r$   r%   resultsseendedupedr*   keyr   r+   r   extract_relations_   s,   


z'TaxReferenceExtractor.extract_relationsc                   s  g }t dtj d tj d t j}||D ]$}|d |d}t||	 |
 }|t|tj ||dd qt|D ]L}|d ||
 |
 d	  }t|}	|	ra|	dnd
}t||	 |
 |	rq|	
 nd }t fdd|D s|t|tj ||dd qBt dtj d t j}
|
|D ]}|d t||	 |
 }|t|tj |dd q|S )u   
        Match patterns like:
          依据/根据/按照 + <doc-number-or-law> + 第X条
          参见/详见 + <doc-number>
        u>   (?:依据|根据|按照|依照|遵照)([^，。；\n]{0,60}?)(   )([^，。；\n]{0,30}?)()?      ?r   r   r   r   r   r   r      Nc                 3  s$    | ]}|j  ko|jd kV  qdS )rB   N)r   r   r(   doc_numr   r   	<genexpr>   s   " z<TaxReferenceExtractor._extract_references.<locals>.<genexpr>gffffff?u-   (?:参见|详见|见)([^，。；\n]{0,10}?)()g333333?r   r   r   r   r   )recompile_DOC_NUMBER_REpattern_ARTICLE_REUNICODEfinditergroup_snippetstartendr4   r   r	   r
   searchany)r8   r#   r$   	relationsref_triggermarticlectxsuffixart_msee_triggerr   rE   r   r.      sx   





$


z)TaxReferenceExtractor._extract_referencesc                 C  s   g }t dtj d t j}||D ]}|d}t|| |	 }|
t|tj||dd qt dt j}||D ]+}|d}	t|	D ]}
|
d}t|	|
 |
	 }|
t|tj||dd qJq>|S )	u   
        Match:
          废止/失效/不再执行/停止执行 + <doc-number>
          同时废止以下文件 + list
        uN   (?:废止|宣布失效|不再执行|停止执行|废除)[^，。；\n]{0,20}?(rH      gq=
ףp?rI   u   (?:同时废止|一并废止|予以废止)(?:以下)?(?:文件|规定|通知|公告)?[：:]([\s\S]{0,500}?)(?=\n\n|\Z|本(?:通知|公告|办法))r   )\(?)rJ   rK   rL   rM   rO   rP   rQ   rR   rS   rT   r4   r   r	   r   )r8   r#   r$   rW   annul_rerY   rF   r[   list_annul_reblocknum_mr   r   r   r/      sJ   

	


z%TaxReferenceExtractor._extract_annulsc           	      C  s~   g }t dtj d tj d t j}||D ]$}|d}|d}t||	 |
 }|t|tj|||dd q|S )uF   
        Match: 修改/调整/修订 + <doc-number> + 第X条
        u;   (?:修改|调整|修订|补充修改)[^，。；\n]{0,20}?(r>   r?   r_      r`   rC   )rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   r4   r   r	   r   )	r8   r#   r$   rW   amend_rerY   rF   rZ   r[   r   r   r   r0     s4   


	z)TaxReferenceExtractor._extract_amendmentsc           	      C  s   g }t dt j}t d}||D ]-}|d }||s't|dk r(qt||	 |
 }|t|tjd||dd q|S )u   
        Match: 附件[1234一二三四]：<title>
        跳过纯编号行（如 "附件：1."），只提取有实际标题的附件引用
        uN   附件[\s]*[1234567890一二三四五六七八九十]?[\s]*[：:]([^\n]{1,80})u5   ^[\d一二三四五六七八九十]+[\.、）\)]*\s*$r_   re   Ngffffff?)r   r   r   r   r   r   )rJ   rK   rO   rP   rQ   stripmatchr7   rR   rS   rT   r4   r   r	   r   )	r8   r#   r$   rW   annex_reskip_rerY   annex_titler[   r   r   r   r1   "  s(   

	z&TaxReferenceExtractor._extract_annexes	List[str]c                 C  sD   t  }g }t|D ]}|d}||vr|| || q
|S )zKReturn all document numbers found in text (deduplicated, order-preserving).r   )r2   rL   rP   rQ   r3   r4   )r8   r#   r:   resultrY   numr   r   r   _extract_doc_numbersF  s   


z*TaxReferenceExtractor._extract_doc_numbersN)r"   )r#   r   r$   r   r%   r   r&   r'   )r#   r   r$   r   r&   r'   )r#   r   r&   rl   )
r   r   r   __doc__r=   r.   r/   r0   r1   ro   r   r   r   r   r!   Z   s    
1
E
0
$r!   r#   r   rS   intrT   r&   c                 C  s8   | t d|tt| | }t|tkr|dt S |S )z7Return a context snippet capped at _CTX_MAX characters.r   N)maxminr7   _CTX_MAX)r#   rS   rT   rawr   r   r   rR   V  s   rR   )r#   r   rS   rq   rT   rq   r&   r   )rp   
__future__r   rJ   dataclassesr   r   enumr   typingr   r   logging	getLoggerr   r5   _CN_BRACKETrK   DOC_NUMBER_PATTERNSrL   rN   rt   r   r	   r   r!   rR   tax_reference_extractorr   r   r   r   <module>   sD    

	
 
}
	