o
    "Æi{-  ã                   @   sš   d Z ddlmZmZmZmZ ddlmZ ddlZddl	m
Z
mZmZ ddlmZmZmZmZmZ ddlmZ e e¡ZG dd	„ d	ƒZdad
efdd„ZdS )uT   
LangChainæ–‡æ¡£å¤„ç†æœåŠ¡
è´Ÿè´£æ–‡æ¡£åŠ è½½ã€åˆ†å—ã€å‘é‡åŒ–ç­‰æ ¸å¿ƒåŠŸèƒ½
é    )ÚListÚDictÚOptionalÚAny)ÚPathN)ÚRecursiveCharacterTextSplitterÚCharacterTextSplitterÚTokenTextSplitter)Ú
TextLoaderÚPyPDFLoaderÚUnstructuredWordDocumentLoaderÚUnstructuredMarkdownLoaderÚUnstructuredHTMLLoader)ÚDocumentc                   @   s†  e Zd ZdZeeeeeeedœZ	e
ddg d¢dœdœedddd	œdœed
ddœdœdœZdd„ Zdedee fdd„Zd&dedee dee fdd„Z			d'dee dedee dee dee f
dd„Z				d(dededee dee dee deeeef  fd d!„Z				d(dededee dee dee deeeef  fd"d#„Z			d)dededededeeef f
d$d%„ZdS )*ÚDocumentProcessoru<   æ–‡æ¡£å¤„ç†å™¨ - ä½¿ç”¨LangChainè¿›è¡Œæ–‡æ¡£åŠ è½½å’Œåˆ†å—)z.txtz.pdfz.docxz.docz.mdz.htmlz.htméè  éÈ   )
z

Ú
u   ã€‚u   ï¼u   ï¼ŸÚ.ú!Ú?Ú Ú )Ú
chunk_sizeÚchunk_overlapÚ
separators)ÚclassÚdefault_paramsr   )r   r   Ú	separatoriô  é2   )r   r   )Ú	recursiveÚ	characterÚtokenc                 C   s   d| _ d| _d| _dS )u   åˆå§‹åŒ–æ–‡æ¡£å¤„ç†å™¨r    r   r   N)Údefault_splitterÚdefault_chunk_sizeÚdefault_chunk_overlap)Úself© r'   úU/lsinfo/ai/hellotax_ai/base_platform/app/services/rag/langchain_document_processor.pyÚ__init__C   s   
zDocumentProcessor.__init__Ú	file_pathÚreturnc              
   C   s–   z2t |ƒ}|j ¡ }|| jvrtd|› ƒ‚| j| }||ƒ}| ¡ }t d|› dt|ƒ› ¡ |W S  t	yJ } zt 
d|› d|› ¡ ‚ d}~ww )u”   
        åŠ è½½æ–‡æ¡£

        Args:
            file_path: æ–‡æ¡£æ–‡ä»¶è·¯å¾„

        Returns:
            LangChain Documentå¯¹è±¡åˆ—è¡¨
        zUnsupported file type: zSuccessfully loaded document: z, pages/paragraphs: zFailed to load document ú: N)r   ÚsuffixÚlowerÚLOADER_MAPPINGÚ
ValueErrorÚloadÚloggerÚinfoÚlenÚ	ExceptionÚerror)r&   r*   ÚpathÚfile_extensionÚloader_classÚloaderÚ	documentsÚer'   r'   r(   Úload_documentI   s   



€þzDocumentProcessor.load_documentNÚtextÚmetadatac                 C   s   t ||pi d}|gS )u»   
        ä»Žæ–‡æœ¬å­—ç¬¦ä¸²åˆ›å»ºæ–‡æ¡£

        Args:
            text: æ–‡æœ¬å†…å®¹
            metadata: å…ƒæ•°æ®

        Returns:
            LangChain Documentå¯¹è±¡åˆ—è¡¨
        ©Úpage_contentr?   )ÚLangChainDocument)r&   r>   r?   Údocr'   r'   r(   Ú	load_texte   s
   þzDocumentProcessor.load_textr    r;   Ústrategyr   r   c              
   K   s  z²|dv r\ddl m}m} g }|D ]7}	|dkr|jn	|dkr!|jn|j}
|j|	j|p+d|p.d|
|	j 	d¡d	}|D ]}| 
t||	j ¡ d
¡ q9qt dt|ƒ› dt|ƒ› d¡ |W S || jvrpt d|› d| j› ¡ | j}| j| }|d }|d  ¡ }|dur‡||d< |dur||d< | |¡ |di |¤Ž}| |¡}t dt|ƒ› dt|ƒ› d¡ |W S  tyÇ } z	t d|› ¡ ‚ d}~ww )u‹  
        åˆ†å‰²æ–‡æ¡£ä¸ºå°å—

        Args:
            documents: LangChain Documentå¯¹è±¡åˆ—è¡¨
            strategy: åˆ†å—ç­–ç•¥ ('recursive', 'character', 'token', 'tax_article', 'tax_clause', 'tax_chapter')
            chunk_size: å—å¤§å°
            chunk_overlap: é‡å å¤§å°
            **kwargs: å…¶ä»–åˆ†å—å‚æ•°

        Returns:
            åˆ†å—åŽçš„Documentåˆ—è¡¨
        )Útax_articleÚ
tax_clauseÚtax_chapterr   )ÚTextSplitterServiceÚSplitterTyperF   rG   r   r   Útitle)r>   r   r   Úsplitter_typeÚdocument_titler@   z"Tax document splitting completed: z	 docs -> z chunkszUnknown chunking strategy: z, using default strategy: r   r   Nr   r   zDocument chunking completed: z documents -> zDocument chunking failed: r'   )Ú,app.services.knowledge.text_splitter_servicerI   rJ   ÚTAX_ARTICLEÚ
TAX_CLAUSEÚTAX_CHAPTERÚ
split_textrA   r?   ÚgetÚappendrB   Úcopyr2   r3   r4   ÚSPLITTER_CONFIGSÚwarningr#   ÚupdateÚsplit_documentsr5   r6   )r&   r;   rE   r   r   ÚkwargsrI   rJ   Úresult_chunksrC   rL   ÚchunksÚ
chunk_textÚconfigÚsplitter_classr   Úsplitterr<   r'   r'   r(   rY   v   sX   þ
û
þÿ 



 €þz!DocumentProcessor.split_documentsc              
   K   s¢   z8|   |¡}|r|D ]}|j |¡ q
| j|f|||dœ|¤Ž}	g }
t|	ƒD ]\}}|
 |j|j|dœ¡ q&|
W S  tyP } zt 	d|› d|› ¡ ‚ d}~ww )u’  
        å®Œæ•´å¤„ç†æ–‡ä»¶ï¼šåŠ è½½ + åˆ†å—

        Args:
            file_path: æ–‡ä»¶è·¯å¾„
            strategy: åˆ†å—ç­–ç•¥
            chunk_size: å—å¤§å°
            chunk_overlap: é‡å å¤§å°
            metadata: é¢å¤–çš„å…ƒæ•°æ®
            **kwargs: å…¶ä»–åˆ†å—å‚æ•°

        Returns:
            å¤„ç†åŽçš„æ–‡æ¡£å—åˆ—è¡¨ï¼Œæ¯ä¸ªå—åŒ…å« {text, metadata, chunk_index}
        ©rE   r   r   ©r>   r?   Úchunk_indexzFailed to process file r,   N)
r=   r?   rX   rY   Ú	enumeraterT   rA   r5   r2   r6   )r&   r*   rE   r   r   r?   rZ   r;   rC   r\   ÚresultÚidxÚchunkr<   r'   r'   r(   Úprocess_fileÅ   s6   
ÿüû	
ý€þzDocumentProcessor.process_filec              
   K   s„   z,|   ||¡}| j|f|||dœ|¤Ž}g }	t|ƒD ]\}
}|	 |j|j|
dœ¡ q|	W S  tyA } z	t d|› ¡ ‚ d}~ww )uZ  
        å®Œæ•´å¤„ç†æ–‡æœ¬ï¼šåˆ›å»ºæ–‡æ¡£ + åˆ†å—

        Args:
            text: æ–‡æœ¬å†…å®¹
            strategy: åˆ†å—ç­–ç•¥
            chunk_size: å—å¤§å°
            chunk_overlap: é‡å å¤§å°
            metadata: å…ƒæ•°æ®
            **kwargs: å…¶ä»–åˆ†å—å‚æ•°

        Returns:
            å¤„ç†åŽçš„æ–‡æ¡£å—åˆ—è¡¨
        ra   rb   zFailed to process text: N)	rD   rY   rd   rT   rA   r?   r5   r2   r6   )r&   r>   rE   r   r   r?   rZ   r;   r\   re   rf   rg   r<   r'   r'   r(   Úprocess_textý   s0   ÿüû	
ý€þzDocumentProcessor.process_textc           	   
   K   s¤   z<| j |f|||dœ|¤Ž}dd„ |D ƒ}t|ƒ|r"t|ƒt|ƒ nd|r)t|ƒnd|r0t|ƒnd|dd… |||dœW S  tyQ } z	t d|› ¡ ‚ d}~ww )	uN  
        é¢„è§ˆåˆ†å—æ•ˆæžœï¼ˆç”¨äºŽPipelineé…ç½®é¡µé¢ï¼‰

        Args:
            text: ç¤ºä¾‹æ–‡æœ¬
            strategy: åˆ†å—ç­–ç•¥
            chunk_size: å—å¤§å°
            chunk_overlap: é‡å å¤§å°
            **kwargs: å…¶ä»–å‚æ•°

        Returns:
            é¢„è§ˆç»“æžœï¼ŒåŒ…å«ç»Ÿè®¡ä¿¡æ¯å’Œç¤ºä¾‹å—
        ra   c                 S   s   g | ]}t |d  ƒ‘qS )r>   )r4   )Ú.0rg   r'   r'   r(   Ú
<listcomp>O  s    z6DocumentProcessor.preview_chunking.<locals>.<listcomp>r   Né   )Útotal_chunksÚavg_chunk_lengthÚmin_chunk_lengthÚmax_chunk_lengthr\   rE   r   r   zFailed to preview chunking: )ri   r4   ÚsumÚminÚmaxr5   r2   r6   )	r&   r>   rE   r   r   rZ   r\   Úchunk_lengthsr<   r'   r'   r(   Úpreview_chunking0  s2   ÿüû	
ø€þz"DocumentProcessor.preview_chunking)N)r    NN)r    NNN)r    r   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r
   r   r   r   r   r/   r   r   r	   rV   r)   Ústrr   rB   r=   r   r   rD   ÚintrY   r   rh   ri   ru   r'   r'   r'   r(   r      sº    ùýþ	ýþ	þþï ûþýüû
ùRúþýüûú
ø;úþýüûú
ø6ûþýüû
ùr   r+   c                   C   s   t du rtƒ a t S )u-   èŽ·å–æ–‡æ¡£å¤„ç†å™¨å®žä¾‹ï¼ˆå•ä¾‹æ¨¡å¼ï¼‰N)Ú_document_processorr   r'   r'   r'   r(   Úget_document_processore  s   r}   )ry   Útypingr   r   r   r   Úpathlibr   ÚloggingÚlangchain_text_splittersr   r   r	   Ú$langchain_community.document_loadersr
   r   r   r   r   Úlangchain_core.documentsr   rB   Ú	getLoggerrv   r2   r   r|   r}   r'   r'   r'   r(   Ú<module>   s    
  J