o
    Ղiv                     @  s8  d Z ddlmZ ddlZddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZmZ ddlmZ dd	lmZmZmZmZmZ h d
ZdZddddddddZedejZddddZdddddddZddddZedZ ededfZ!eddG dd dZ"eG dd  d Z#eddG d!d" d"Z$G d#d$ d$Z%dd*d+Z&dd.d/Z'	0	1		ddd9d:Z(	1ddd=d>Z)dd@dAZ*ddDdEZ+ddGdHZ,ddJdKZ-ddNdOZ.dddTdUZ/ddWdXZ0dd[d\Z1		dddadbZ2ddddeZ3ddldmZ4ddndoZ5ddqdrZ6ddudvZ7ddwdxZ8dd{d|Z9dddZ:dddZ;dS )a)  Build and normalize document relationship fields.

This module is the single source of truth for populating tax_documents.supersedes
and tax_documents.references. It is used by both the Celery rebuild task and the
one-off repair script so that full repairs and future rebuilds use the same rules.
    )annotationsN)	dataclassfield)date)AnyIterable)Session	load_only)TaxDocument)&clear_duplicate_external_superseded_byis_same_legal_referencenormalize_legal_doc_numbernormalize_relation_titlerelation_identity_key>   	normative
regulationlocal_policy   	effectiveobsoletepartially_obsoleteamended)u   全文有效u   全文废止u   全文失效u   部分废止u   部分失效u	   已修订u   部分修订zr<p[^>]+class=["\'][^"\']*arc_date[^"\']*["\'][^>]*>.*?<span[^>]+class=["\'][^"\']*xg[^"\']*["\'][^>]*>(.*?)</span>         )amendspartially_supersedes
supersedes   )N r   r   r   r   u7   ^(?P<title>.+?)修正案(?:[（(][^）)]{1,20}[）)])?$uD   关于(?:修改|修订)[《〈](?P<title>[^》〉\n]{2,100})[》〉]ux   关于[^。\n]{0,80}修订(?:后|后的|发布|印发)?[^《〈。\n]{0,20}[《〈](?P<title>[^》〉\n]{2,100})[》〉]T)frozenc                   @  sR   e Zd ZU ded< ded< ded< ded< ded< d	ed
< ded< dZded< dS )RelationTargetintidstrtitle
str | None
doc_number
source_urlcategory_iddate | None
issue_datedoc_typeNsource_status)__name__
__module____qualname____annotations__r-    r2   r2   ^/lsinfo/ai/hellotax_ai/data_center/backend/app/services/tax_data_processor/relation_builder.pyr!   9   s   
 r!   c                   @  s$  e Zd ZU dZded< dZded< dZded< dZded< dZded< dZ	ded< dZ
ded	< dZded
< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< eedZded< d$ddZd%d!d"Zd#S )&RelationBuildStatsr   r"   documents_processeddocuments_changedsupersedes_self_removedreferences_self_removed references_removed_by_supersedessupersedes_dedupedreferences_dedupedsupersedes_resolvedreferences_resolvedsupersedes_unresolvedreferences_unresolved supersedes_demoted_to_references!references_invalid_target_removed non_regulation_documents_skippedreverse_links_changedinvalid_reverse_links_clearedstatuses_changedstatus_review_neededduplicate_external_clearederrors)default_factoryz	list[str]samplesother'RelationBuildStats'returnNonec                 C  sD   | j D ]}|dkr| j|j qt| |t| |t||  qd S )NrJ   )__dataclass_fields__rJ   extendsetattrgetattr)selfrK   namer2   r2   r3   add]   s
   
zRelationBuildStats.adddict[str, Any]c                   s    fdd j D d jiB S )Nc                   s    i | ]}|d kr|t  |qS )rJ   )rR   ).0rT   rS   r2   r3   
<dictcomp>e   s
    
z.RelationBuildStats.as_dict.<locals>.<dictcomp>rJ   )rO   rJ   rX   r2   rX   r3   as_dictd   s
   
zRelationBuildStats.as_dictN)rK   rL   rM   rN   )rM   rV   )r.   r/   r0   r5   r1   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   r   listrJ   rU   rZ   r2   r2   r2   r3   r4   E   s0   
 
r4   c                   @  s.   e Zd ZU ded< ded< ded< ded< dS )	ReverseCandidater"   	source_id	target_idr$   relationr*   source_issue_dateN)r.   r/   r0   r1   r2   r2   r2   r3   r\   l   s
   
 r\   c                   @  s   e Zd ZdZd+ddZed,d
dZ			d-d.ddZd/ddZe	d0ddZ
				d1d2dd Ze	d3d"d#Ze	d4d)d*ZdS )5RelationResolverz>Resolve raw relation references to unique TaxDocument targets.docsIterable[TaxDocument]c                 C  s   i | _ i | _i | _i | _g | _|D ]Z}t|j|j|j|j	|j
|j|jt|dd p/tt|dd d}|| j |j< |j	rC|| j|j	 < | |jD ]
}| | j|| qIt|j}|rk| j||f | | j|| qd S )Nr-   content_html)r#   r%   r'   r(   r)   r+   r,   r-   )
docs_by_idby_source_urlby_doc_numberby_titletitle_targetsr!   r#   r%   r'   r(   r)   r+   r,   rR   source_status_from_htmlstrip_doc_number_keys_add_uniquer   append)rS   rb   doctargetkey	title_keyr2   r2   r3   __init__w   s8   
zRelationResolver.__init__dbr   rM   'RelationResolver'c                 C  sF   | tttjtjtjtjtjtj	tj
tjtj }| |S N)queryr
   optionsr	   r#   r%   r'   r(   r)   r+   r,   rd   order_byall)clsrt   rb   r2   r2   r3   from_db   s"   zRelationResolver.from_dbNr'   r&   r%   r(   RelationTarget | Nonec                 C  s   |r| j | }|r|S |pd }|dr&| jt|}|r%|S n| |D ]}| j|}|r9|  S q+t|}|rJ| j|}|rJ|S d S )Nr      《u   〈)rf   getrk   
startswithrh   r   rl   rg   )rS   r'   r%   r(   rp   
raw_numberrq   rr   r2   r2   r3   lookup   s*   
zRelationResolver.lookup
source_docr
   c                   sB   t |tdk rd S  fdd| jD }|sd S | | S )Nr   c                   s8   g | ]\}}|j  j krt|r|v s|v r|qS r2   )r#   is_regulation_like)rW   
target_keyrp   r   rr   r2   r3   
<listcomp>   s    z<RelationResolver.lookup_containing_title.<locals>.<listcomp>)r   lenri   _best_title_candidate)rS   r%   r   
candidatesr2   r   r3   lookup_containing_title   s   z(RelationResolver.lookup_containing_titler   list[RelationTarget]r!   c                   s   t |dd t |dd  d fdd}rAfdd	| D }|r,||}t|d
d dS dd	 | D }|rA||}t|dd dS || }t|dd dS )Nr+   r)   poolr   rM   c                   s    fdd| D }|p| S )Nc                   s   g | ]	}|j  kr|qS r2   )r)   rW   rp   source_categoryr2   r3   r      s    zXRelationResolver._best_title_candidate.<locals>.prefer_same_category.<locals>.<listcomp>r2   )r   same_categoryr   r2   r3   prefer_same_category   s   zDRelationResolver._best_title_candidate.<locals>.prefer_same_categoryc                   s    g | ]}|j r|j  kr|qS r2   r+   r   )source_dater2   r3   r      s    
z:RelationResolver._best_title_candidate.<locals>.<listcomp>c                 S     | j ptj| jfS rv   r+   r   minr#   rp   r2   r2   r3   <lambda>       z8RelationResolver._best_title_candidate.<locals>.<lambda>)rq   c                 S  s   g | ]}|j r|qS r2   r   r   r2   r2   r3   r      s    c                 S  r   rv   )r+   r   maxr#   r   r2   r2   r3   r      r   c                 S  r   rv   r   r   r2   r2   r3   r      r   )r   r   rM   r   )rR   r   r   )r   r   r   	not_afterr   afterr2   )r   r   r3   r      s    
z&RelationResolver._best_title_candidaterp   boolc           
      C  s   |r
|j |j kr
dS |r|jr| |j krdS |pd }t|j}|r6|ds6|r6t||kr6dS t|j}t|pF|drE|nd }	t|	oP|oP|	|kS )NTr   r~   )	r#   r(   rk   r   r'   r   r   r%   r   )
rS   r   r'   r%   r(   rp   r   
own_number	own_title	raw_titler2   r2   r3   is_self_reference   s   

z"RelationResolver.is_self_referenceset[str]c                 C  sd   | st  S |  }|dddd}t|}|||h}|dr+||ddd dd |D S )	N r      　u   中华人民共和国主席令u	   主席令r   c                 S     h | ]}|r|qS r2   r2   rW   rq   r2   r2   r3   	<setcomp>      z4RelationResolver._doc_number_keys.<locals>.<setcomp>)setrk   replacer   r   rU   )r'   strippedcompact
normalizedkeysr2   r2   r3   rl     s   

z!RelationResolver._doc_number_keysindex dict[str, RelationTarget | None]rq   r$   rN   c                 C  sF   |  |}|d u r|| v rd S |r|j|jkrd | |< d S || |< d S rv   )r   r#   )r   rq   rp   existingr2   r2   r3   rm     s   
zRelationResolver._add_unique)rb   rc   )rt   r   rM   ru   )NNN)r'   r&   r%   r&   r(   r&   rM   r}   )r%   r&   r   r
   rM   r}   )r   r   r   r
   rM   r!   )NNNN)r   r
   r'   r&   r%   r&   r(   r&   rp   r}   rM   r   )r'   r&   rM   r   )r   r   rq   r$   rp   r!   rM   rN   )r.   r/   r0   __doc__rs   classmethodr|   r   r   staticmethodr   r   rl   rm   r2   r2   r2   r3   ra   t   s*    
!
 ra   ro   r
   resolverrM   Stuple[list[dict[str, Any]] | None, list[dict[str, Any]] | None, RelationBuildStats]c                 C  sf   t | sd d tdddfS || jpd}t| |}|r-i |dg |dp'g |i}t| ||S )Nr   )r5   rB   r   r   )r   r4   extractcontent_markdown_title_inferred_supersedesr   normalize_extracted_relations)ro   r   	extractorreltitle_supersedesr2   r2   r3   build_relations_for_document*  s   
r   r   rV   c              	   C  s  t dd}t| sd|_d d |fS t|di }i }g }|dg p$g D ]}t|ts-q%|dp3d }|dpA|t|}	|d}
|d	pMd}|t	vrTd}|
||	|
}|| ||	|
|rl| jd7  _q%t| ||s| jd7  _|t||	||
d
 q%t||	|||
d}t|}|sq%t|||| |r| jd7  _q%| jd7  _q%t| }t|}i }|D ]=}t| |d|s| jd7  _qt|}||@ r| jd7  _qt|||r|dr| jd7  _q| jd7  _q|dg pg D ]_}|j
|d
}|sq|j| ||dr"| jd7  _qt|j p(d|j!||d
}t| |d|sB| jd7  _qt|}||@ rT| jd7  _qt||| | jd7  _q|dg pkg D ]}t|tsvql|dp}d }|dp|t|}	|d}
|
||	|
}|| ||	|
|r| jd7  _qlt||	||
d
}t| |d|s| jd7  _qlt|}||@ r| jd7  _qlt|||r|r| jd7  _ql| jd7  _qlt| }|pd |pd |fS )Nr   )r5   doc_num_to_titler   r'   r   r%   r(   r_   r(   )r_   r(   doc_idreference_urls)r(   rp   
references)"r4   r   rB   _expand_doc_num_to_titler   
isinstancedictrk   _doc_num_title_keyRELATION_RANKr   r   r7   can_apply_legal_relationr@   rn   _relation_entry_primary_key_add_supersedes_entryr<   r>   r[   values_all_relation_keyscan_keep_referencerA   _entry_keysr9   _add_reference_entryr=   r?   r8   r'   r%   )ro   r   r   statsr   supersedes_by_keydemoted_referencesraw_itemr   r   r(   r_   rp   entryrq   r   supersedes_keysreferences_by_keyr   urlr   r2   r2   r3   r   >  s   







r      Frt   r   
batch_sizer"   dry_runr   limitc                 C  s  ddl m} | }t| }t }i }	| tttj	tj
tjtjtjtjtjtjtjtj
tjdtj	}
|
 }|rFt||}d}d}||k rt||| }|
|| }|scn|D ]}}zOt|||\}}}|| ||	|j	< t|jt|kst|jt|kr| jd7  _t|j dk r|j !d|j	 d|j
dd   |s||_||_W n' t"y } z| j#d7  _#|r|$d	|j	 d
|  W Y d}~nd}~ww |d7 }qe|s| %  |t|7 }|r|&d| d| d|j  ||k sO|j'( D ]%}|j	|	vr.| tjtj	|j	k) }t*|t+r)|nd|	|j	< q
t,| ||	|d}|| |sD| %  |S )z?Re-extract and normalize relation fields for the full database.r   )RelationshipExtractorNr   r   zid=z title=P   z[build_relations] doc_id=u    处理异常: u   [build_relations] 进度: /z	 changed=)r   )->app.services.tax_data_processor.parsers.relationship_extractorr   ra   r|   r4   rw   r
   rx   r	   r#   r%   r'   r(   r)   r+   r,   r   r   r   filterisnotry   countr   offsetr   rz   r   rU   _json_relation_valuer6   r   rJ   rn   	ExceptionrH   warningcommitinfore   r   scalarr   r[   apply_reverse_relation_updates)rt   r   r   loggerr   r   r   r   r   relations_by_doc_idrw   total	processedr   current_limitbatchro   r   r   	doc_statsexcrp   r   reverse_statsr2   r2   r3   rebuild_document_relations  s   




$$
 
r   r   &dict[int, list[dict[str, Any]] | None]c                 C  s  t  }t||}| ttjd  }t|dd |	 D B dd |D B dd |D B }|rA| ttj
| ng }dd |D }	t|}
|D ].}|j
|
v rXqP|	|j}|rft||rfqP| jd7  _|jdv ry| jd7  _|s~d |_qP| D ][\}}|	|}|	|j}|r|sq|j|jkr| jd7  _|s|j|_t|j|j|j|j}|r| jd7  _|st|| t|j|}|r||jkr| jd7  _|s||_q|S )	Nc                 S     h | ]}|j qS r2   )r]   )rW   	candidater2   r2   r3   r     r   z1apply_reverse_relation_updates.<locals>.<setcomp>c                 S  r   r2   r#   rW   ro   r2   r2   r3   r     r   c                 S  s   h | ]}|j r|j qS r2   )superseded_by_doc_idr   r2   r2   r3   r     s    c                 S  s   i | ]}|j |qS r2   r   r   r2   r2   r3   rY     r   z2apply_reverse_relation_updates.<locals>.<dictcomp>r   )r   r   r   )r4   _reverse_candidatesrw   r
   r   r   r   rz   r   r   r#   in_r   r   rD   
doc_statusrF   itemsr]   rC   r   superseded_by_doc_numbersuperseded_by_titler'   r%   rG   r   _status_from_relationr_   rE   )rt   r   r   r   r   r   linked_docsdoc_idsrb   re   candidate_target_ids
target_docr   r^   r   should_clear_externalproposed_statusr2   r2   r3   r     sx   






r   dict[int, ReverseCandidate]c                 C  s   i }|  D ][\}}| j|}|rt|tsq|D ]F}t|ts"q|d}|r-||kr.q| j|}|dp:d}	|rCt|||	sDqt|||	|jd}
||}|r\t	|
t	|kr`|
||< qq|S )Nr   r_   r   )r]   r^   r_   r`   )
r  re   r   r   r[   r   r   r\   r+   _candidate_sort_key)r   r   r   r]   r   sourceitemr^   rp   r_   r   r   r2   r2   r3   r   K  s6   


r   r   tuple[int, date, int]c                 C  s   t | jd| jptj| jfS Nr   )r   r   r_   r`   r   r   r]   )r   r2   r2   r3   r  j  s   
r  r   c                 C  s   t | dd tv S )Nr,   )rR   REGULATION_DOC_TYPES)ro   r2   r2   r3   r   r  s   r   
int | Nonec                 C  sR   t | sdS t| dddkr't| dd}t|tr'd|  kr$dkr'|S  tS tS )z/Return lower number for higher legal hierarchy.Nr,   r   r)   r      )r   rR   r   r"   LOCAL_RELATION_LEVEL)ro   r)   r2   r2   r3   legal_relation_levelv  s   r  rd   r&   c                 C  sX   | sd S t | }|sd S tdd|d }t D ]\}}||v r)|  S qd S )Nz<[^>]+>r   r   )SOURCE_STATUS_REsearchresubgrouprk   SOURCE_STATUS_MAPr  )rd   matchstatus_textlabelstatusr2   r2   r3   rj     s   
rj   r  rp   
Any | Noner_   c                 C  s   t | sdS |p	d}|du rdS t |sdS |dkr@t|dd}|du r,tt|dd}|dkr2dS | jr@|jr@| j|jk r@dS t| }t|}|du sP|du rRdS ||krXdS dS )	z9Return whether source can legally supersede/amend target.Fr   NTr   r-   rd   r   )r   rR   rj   r+   r  )r  rp   r_   r-   source_leveltarget_levelr2   r2   r3   r     s,   r   r^   c                 C  s$   |sdS |j |}t|ot|S )zAResolved references may only point to legal/regulatory documents.T)re   r   r   r   )r  r^   r   rp   r2   r2   r3   r     s   r   r$   r	  c                 C  sJ   |j tvrd S t| }|sd S t|jd}t|d}||kr#|S d S r  )r,   r  STATUS_BY_RELATIONr   STATUS_RANKr  )r_   r	  proposedcurrent_rankproposed_rankr2   r2   r3   r    s   

r  r   r   r}   r(   c                 C  sL   |r|j r|j n| |r|jnd |r|jn|d}|r||d< |r$||d< |S )N)r'   r   r%   r_   r(   )r'   r#   r%   )r   r   rp   r_   r(   r   r2   r2   r3   r     s   r   list[dict[str, Any]]c                 C  s  | j pd }|sg S g }t| }r||d tD ]}|dd ||D  qg }t	 }t
|}|D ]F}	t
|	}
|
rI|
|ksI|
|v rJq9|j|	dpU||	| }|r^|j| jkr_q9||
 d|	  d|	 dd	}|jrz|j|d
< || q9|S )uc   Infer direct amendment targets from titles such as 刑法修正案 or 关于修改《X》的决定.r   r%   c                 s  s    | ]}| d V  qdS )r%   N)r  )rW   r  r2   r2   r3   	<genexpr>  s    z-_title_inferred_supersedes.<locals>.<genexpr>r%   r   u   》r   )r'   r%   r_   r(   )r%   rk   _TITLE_AMENDMENT_REr  rn   r  _TITLE_AMEND_PATTERNSrP   finditerr   r   r   r   r#   rU   r(   )ro   r   r%   candidate_titlesr  patternentriesseenown_title_keycandidate_titlenormalized_titlerp   r   r2   r2   r3   r     s<   

r   entries_by_keydict[str, dict[str, Any]]rq   r   r   rN   c                 C  s\   |  |}|s|| |< d S | jd7  _t | ddt | ddkr,|| |< d S d S )Nr   r_   r   )r   r:   r   )r6  rq   r   r   r   r2   r2   r3   r     s   
$r   c                 C  s6   t |}|sdS || v r| jd7  _dS || |< dS )NFr   T)r   r;   )r6  r   r   rq   r2   r2   r3   r   	  s   r   r   c                 C  s   t | d| d| d| ddh}t | dd}t | dd}t | dd}|d	d
 |||fD  dd |D S )Nr   r'   r%   r(   )r   r'   r%   r(   r'   r+  r   c                 s  s    | ]}|r|V  qd S rv   r2   r   r2   r2   r3   r*  $  s    z_entry_keys.<locals>.<genexpr>c                 S  r   r2   r2   r   r2   r2   r3   r   %  r   z_entry_keys.<locals>.<setcomp>)r   r   update)r   r   
number_keyrr   url_keyr2   r2   r3   r     s   r   r1  Iterable[dict[str, Any]]c                 C  s"   t  }| D ]	}|t| q|S rv   )r   r9  r   )r1  r   r   r2   r2   r3   r   (  s   r   c                 C  s8   t | }t| dd}||v r|S |rt|d S dS )Nr   )r   r   r   )r   r   r   sorted)r   r   id_keyr2   r2   r3   r   /  s
   r   valuelist[dict[str, Any]] | Nonec                 C  s   t | tr	| r	| S d S rv   )r   r[   )r?  r2   r2   r3   r   7  s   r   mappingdict[str, str]c                 C  s:   i }|   D ]\}}||t|< t|}|r|||< q|S rv   )r  r   r   )rA  expandedr   r%   r   r2   r2   r3   r   ;  s   r   r'   c                 C  s   | pd dd ddS )Nr   r   r   )r   r8  r2   r2   r3   r   E  s   r   )ro   r
   r   ra   rM   r   )ro   r
   r   rV   r   ra   rM   r   )r   FNr   )
rt   r   r   r"   r   r   r   r"   rM   r4   )F)
rt   r   r   ra   r   r   r   r   rM   r4   )r   ra   r   r   rM   r  )r   r\   rM   r  )ro   r   rM   r   )ro   r   rM   r  )rd   r&   rM   r&   rv   )r  r   rp   r!  r_   r&   rM   r   )r  r   r^   r  r   ra   rM   r   )r_   r$   r	  r
   rM   r&   )NN)r   r$   r   r&   rp   r}   r_   r&   r(   r&   rM   rV   )ro   r
   r   ra   rM   r)  )
r6  r7  rq   r$   r   rV   r   r4   rM   rN   )r6  r7  r   rV   r   r4   rM   r   )r   rV   rM   r   )r1  r<  rM   r   )r   rV   rM   r$   )r?  r   rM   r@  )rA  rB  rM   rB  )r'   r&   rM   r$   )<r   
__future__r   r  dataclassesr   r   datetimer   typingr   r   sqlalchemy.ormr   r	   app.models.tax_datar
   1app.services.tax_data_processor.relation_identityr   r   r   r   r   r  r  r  compileSr  r   r%  r$  r,  r-  r!   r4   r\   ra   r   r   r   r   r   r  r   r  rj   r   r   r  r   r   r   r   r   r   r   r   r   r   r2   r2   r2   r3   <module>   s    		
& 
7
rX
I







(






