
    Xj                     x    d dl Z d dlZd dlmZ  ee          Z G d dej                  Z G d d          Z	dS )    N)
get_loggerc                   2    e Zd Zd Zd Zd Zd Zd Zd ZdS )TaxMarkdownConverterc                 ,    dt          |          z   dz   S )N

)strselfeltextconvert_as_inlines       ^/lsinfo/ai/hellotax_ai/data_center/backend/app/services/tax_data_processor/document_cleaner.pyconvert_tablez"TaxMarkdownConverter.convert_table   s    B&((    c                     dS N  r	   s       r   
convert_trzTaxMarkdownConverter.convert_tr       rr   c                     dS r   r   r	   s       r   
convert_tdzTaxMarkdownConverter.convert_td   r   r   c                     dS r   r   r	   s       r   
convert_thzTaxMarkdownConverter.convert_th   r   r   c                     d| dS Nz<strong>z	</strong>r   r	   s       r   convert_strongz#TaxMarkdownConverter.convert_strong       )$))))r   c                     d| dS r   r   r	   s       r   	convert_bzTaxMarkdownConverter.convert_b   r   r   N)	__name__
__module____qualname__r   r   r   r   r   r    r   r   r   r   r   	   sn        ) ) )      * * ** * * * *r   r   c                       e Zd Zd ZdedefdZdedefdZdedefdZdedefd	Zd
edefdZ	e
dedefd            ZdedefdZdedefdZdS )DocumentCleanerc                     d S )Nr   )r
   s    r   __init__zDocumentCleaner.__init__   s    r   html_contentreturnc                 :   	 t          ddddg                              |          }|                     |          }|S # t          $ rP}t                              d|            ddlm}  ||d	          }|                    d
d          cY d }~S d }~ww xY w)NATX-scriptstyle)heading_stylebulletsstripu   HTML 转 Markdown 失败: r   BeautifulSouplxml
T	separatorr1   )	r   convertclean_markdown	Exceptionloggererrorbs4r3   get_text)r
   r(   markdown_contenter3   soups         r   html_to_markdownz DocumentCleaner.html_to_markdown"   s    	=3%QT]egn\opppxx  zF   G   G#223CDD## 	= 	= 	=LL9a99:::)))))) =v66D==4t=<<<<<<<<		=s   =A   
B
ABBBr?   c                    t          j        dd|          }d |                    d          D             }d                    |          }g d}|D ]}t          j        |d|          }t          j        dd|t           j                  }|                     |          }|                     |          }t          j        dd|          }|                                S )	N\n{3,}r   c                 6    g | ]}|                                 S r   )r1   ).0lines     r   
<listcomp>z2DocumentCleaner.clean_markdown.<locals>.<listcomp>/   s     GGG$GGGr   r5   )z$!\[.*?\]\(.*?huibiao.*?\)\s*\S+.*?\nu   \[下载文字版\]\(.*?\)\n?u   \[下载图片版\]\(.*?\)\n?u-   字体：\s*【大】\s*【中】\s*【小】u   分享到：.*u   全文有效u   成文日期：.*u   【打印】u   【下载】u   纠错或建议z [^\n]*\{[^\n]*font[^\n]*\}[^\n]*r   z[^\n]*\{[^}]*font[^}]*\}flags)resubsplitjoinDOTALL_normalize_headings_normalize_listsr1   )r
   r?   linesnoise_patternspatterns        r   r9   zDocumentCleaner.clean_markdown-   s    6)V5EFFGG*:*@*@*F*FGGG99U++ ]  ]  ]% 	E 	EG!vgr3CDD6"?EU]_]fggg334DEE001ABB6)V5EFF%%'''r   contentc                 T   |                     d          }g }|D ]z}|                    d          rN|r!|d         dk    r|                    d           |                    |           |                    d           e|                    |           {d                    |          S )Nr5   #r   )rM   
startswithappendrN   )r
   rU   rR   normalized_linesrG   s        r   rP   z#DocumentCleaner._normalize_headings:   s    d## 	. 	.Ds## .# 0(8(<(B(B$++B/// ''--- ''++++ ''----yy)***r   c                     t          j        dd|t           j                  }t          j        dd|t           j                  }|S )Nz^\s*[\*\+]\s+z- rI   z^\s*(\d+)\.\s+z\1. )rK   rL   	MULTILINE)r
   rU   s     r   rQ   z DocumentCleaner._normalize_listsG   s=    &,dG2<PPP&-wr|TTTr   r   c                     t          j        dd|          }t          j        dd|          }|                                S )Nz\s+ z&[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]r   )rK   rL   r1   )r
   r   s     r   
clean_textzDocumentCleaner.clean_textL   s7    vfc4((vFDQQzz||r   markdownc                     t          j        dd| t           j                  } t          j        dd|           } |                                 S )Nu$   \n\n---\n\n## 正文图片内容\b.*r   rI   u   <!-- 图片\d* OCR -->\n?)rK   rL   rO   r1   )ra   s    r   strip_ocr_contentz!DocumentCleaner.strip_ocr_contentQ   sC    6Er8[][deee67XFF~~r   c                 b   |sdS g fd}t          j        d||t           j                  }t          j        dd|          }t          j        dd|          }t          j        dd|t           j                  }t          j        d	d|          }t          j        d
d|t           j        t           j        z            }t          j        dd|t           j        t           j        z            }t          j        dd|t           j        t           j        z            }t          j        dd|t           j                  }t          j        dd|          }t                    D ]\  }}|                    d| d|          } t          j        dd|          }|                                S )Nr   c                                          |                     d                     dt                    dz
   dS )Nr   z


__TABLE_   z__

)rZ   grouplen)mtabless    r   _save_tablez9DocumentCleaner.markdown_to_rag_text.<locals>._save_table\   s:    MM!''!**%%%9#f++/9999r   z<table[\s\S]*?</table>rI   z!\[.*?\]\(.*?\)z\[([^\]]*)\]\([^)]*\)z\1z
^#{1,6}\s+z\*{1,2}([^*]+)\*{1,2}z<strong>(.*?)</strong>z<b>(.*?)</b>z<em>(.*?)</em>z^\s*---+\s*$z	`([^`]*)`__TABLE___rD   r   )rK   rL   
IGNORECASEr]   rO   	enumeratereplacer1   )r
   ra   rk   r   itablerj   s         @r   markdown_to_rag_textz$DocumentCleaner.markdown_to_rag_textW   s    	2	: 	: 	: 	: 	: v0+xr}]]]v+R66v2E4@@vmRR\BBBv/==v.t2=SUS\C\]]]vneT9RSSSv&t2=29;TUUUv&DEEEvk5$//!&)) 	9 	9HAu<< 01 0 0 0%88DDvi..zz||r   c                     ddl m}  ||d          } |ddg          D ]}|                                 |                    dd          }|                     |          }|S )	Nr   r2   r4   r-   r.   r5   Tr6   )r=   r3   	decomposer>   r`   )r
   r(   r3   rA   r-   r   s         r   extract_plain_textz"DocumentCleaner.extract_plain_textn   s    %%%%%%}\622dHg.// 	 	F}}t4}88t$$r   N)r!   r"   r#   r'   r   rB   r9   rP   rQ   r`   staticmethodrc   rs   rv   r   r   r   r%   r%      s?         	=S 	=S 	= 	= 	= 	=(s (s ( ( ( (+3 +3 + + + +     
s s    
  C  C       \ 
S S    .s s      r   r%   )
rK   markdownify_markdownifycommon_loggingr   r!   r;   MarkdownConverterr   r%   r   r   r   <module>r|      s    				 " " " " % % % % % %	H		* * * * *<9 * * *(X X X X X X X X X Xr   