o
    i                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ d
dlmZmZmZ d
dlmZ dedededdfddZdedededee dee f
ddZde
dededee fddZde
dededdfd d!ZdS )"uP   Phase 2：批量导入文档到知识库，回写 is_imported / knowledge_doc_id    N)Optional)logger)text)Session)SessionLocal)TaxDocument)KnowledgeBaseClient_parse_doc_number_parts   )
BATCH_SIZEDOC_TYPE_MAPREQUEST_DELAY)ImportStatestatebase_urltokenreturnc              	   C   s  |  drtd d S |  }|  }t||d}|  }d}d}td|  	 t }	t|	|t	d}
W d    n1 s@w   Y  |
sHn|
D ]}|j
rT|j
 sntd|j d	|j  |d
7 }|j}| | qJ||j}tt||||}|r|dr|dp|d}t }	t|	|j| W d    n1 sw   Y  |d
7 }td| d|j d|  n|r|dnd}td|j d	|j d|  |j}| | tt qJq+td| d| d | d d S )N2u   Phase 2 已完成，跳过)r   api_keyr   u   Phase 2 开始，cursor=T)after_id
batch_sizeu   跳过空内容文档: id=z title=r
   successknowledge_doc_iddocument_id[u   ] 导入成功: id=u    → knowledge_doc_id=errorunknownu   导入失败: id=z error=u   ✓ Phase 2 完成：导入 u    条，跳过 u    条)is_phase_doner   info	get_kb_idget_category_mapr   get_phase2_cursorr   _fetch_batchr   content_textstripwarningidtitleset_phase2_cursorgetcategory_idasynciorun_import_one
_writebackr   timesleepr   mark_phase_done)r   r   r   kb_idcategory_mapclientcursortotal_importedtotal_skippeddbbatchdoc	kb_cat_idresultr   r    r=   U/lsinfo/ai/hellotax_ai/data_center/backend/app/services/import_kb/phase2_documents.py
run_phase2   sP   


  
 r?   r4   r:   r2   r*   c              	      s0  t |j}dg}|jr|d|j d |jr#|d|j d |jr2|d|j  d |jrA|d|j  d |jrN|d|j d |jr[|d|j d |j	rh|d	|j	 d |d
|j
 d |d|j  |d d|d }|jpg }t|}|rtdd |D nd}	d}
|rdg}t|dD ]i\}}|dd| }|dd}|dd }|ddd }|dk r| dn|d  d}|r|| d| d| d|rd | d!| dnd  q|| d"| |rd | d!| dnd  qd|}
||j |
 }| jd5i d#|d$|jd%|d&|j
d'|jd(|d( d)|d) d*|jd+|jrN|j ndd,|jrZ|j ndd-|jp}d.d/|d0|	d1|jd2dd3t|jd4I dH S d/|d0|	d1|jd2dd3t|jd4I dH S )6uB   构建 frontmatter 并调用 KnowledgeBaseClient.import_document()z---zdoc_number: ""zissuing_authority: "zissue_date: "zeffective_date: "zdoc_status: "zsuperseded_by: "zsuperseded_by_title: "zsource_url: "ztax_category_id: 
z

c                 S   s&   h | ]}| d r| d d qS )type )r)   lower).0ar=   r=   r>   	<setcomp>d   s   & z_import_one.<locals>.<setcomp>NrC   u   

## 附件r
   nameu   附件urlrB   sizer   i   KBMBz. [z]()z (z, z. r*   r'   content
source_url
doc_numberdoc_number_yeardoc_number_serialissuing_authority
issue_dateeffective_date
doc_status	effectivehas_attachmentattachment_typescontent_hashversion_numberdoc_typenoticer=   )r	   rP   appendrS   rT   	isoformatrU   rV   superseded_by_doc_numbersuperseded_by_titlerO   r*   joinattachmentsboollist	enumerater)   upperr#   create_documentr'   rZ   r   )r4   r:   r2   r*   	num_parts
meta_linesfrontmatterrc   has_att	att_typesatt_section	att_linesiattrH   rI   ftypesize_kbsize_strfull_contentr=   r=   r>   r-   D   s   


 82

	


r-   r8   r   r   c                 C   sF   |  ttjdktjdktjd tj|ktj	 
| S )N	completedF)queryr   filterprocessing_statusis_importedr#   isnotr&   order_byasclimitall)r8   r   r   r=   r=   r>   r"      s   
r"   
tax_doc_idr   c                 C   s"   |  td||d |   d S )NzMUPDATE tax_documents SET is_imported=true, knowledge_doc_id=:kid WHERE id=:id)kidr&   )executer   commit)r8   r   r   r=   r=   r>   r.      s
   r.   ) __doc__r+   r/   typingr   logurur   
sqlalchemyr   sqlalchemy.ormr   app.databaser   app.models.tax_datar   5app.services.tax_data_processor.knowledge_base_clientr   r	   configr   r   r   r   r   strr?   intdictr-   re   r"   r.   r=   r=   r=   r>   <module>   s4    3
H