
    Xj!                         d dl mZ d dlmZ d dlmZmZmZmZm	Z	 d dl
mZ d dlmZmZmZ d dlmZ  ee          Z G d d          Zd	ad
efdZd	S )    )Path)Any)PyPDFLoader
TextLoaderUnstructuredHTMLLoaderUnstructuredMarkdownLoaderUnstructuredWordDocumentLoader)Document)CharacterTextSplitterRecursiveCharacterTextSplitterTokenTextSplitter)
get_loggerc                      e Zd ZeeeeeeedZe	ddg ddde
ddddded	d
dddZd Zdedee         fdZddededz  dee         fdZ	 	 	 ddee         dededz  dedz  dee         f
dZ	 	 	 	 d dedededz  dedz  dedz  deeeef                  fdZ	 	 	 	 d dedededz  dedz  dedz  deeeef                  fdZ	 	 	 d!dededededeeef         f
dZdS )"DocumentProcessor)z.txtz.pdfz.docxz.docz.mdz.htmlz.htm     )
z


u   。u   ！u   ？.!?  )
chunk_sizechunk_overlap
separators)classdefault_paramsr   )r   r   	separatori  2   )r   r   )	recursive	charactertokenc                 0    d| _         d| _        d| _        d S )Nr    r   r   )default_splitterdefault_chunk_sizedefault_chunk_overlap)selfs    U/lsinfo/ai/hellotax_ai/base_platform/app/services/rag/langchain_document_processor.py__init__zDocumentProcessor.__init__6   s      +"&%("""    	file_pathreturnc                    	 t          |          }|j                                        }|| j        vrt	          d|           | j        |         } ||          }|                                }t                              d| dt          |                      |S # t          $ r&}t          
                    d| d|             d }~ww xY w)NzUnsupported file type: zSuccessfully loaded document: z, pages/paragraphs: zFailed to load document : )r   suffixlowerLOADER_MAPPING
ValueErrorloadloggerinfolen	Exceptionerror)r'   r+   pathfile_extensionloader_classloader	documentses           r(   load_documentzDocumentProcessor.load_document;   s    		??D![..00NT%888 !K>!K!KLLL.~>L!\),,FIKK```PST]P^P^``    	 	 	LLDIDDDDEEE	s   BB   
C*!CCNtextmetadatac                 .    t          ||pi           }|gS )Npage_contentrA   )LangChainDocument)r'   r@   rA   docs       r(   	load_textzDocumentProcessor.load_textL   s    THNKKKur*   r    r=   strategyr   r   c           
         	 |dv rddl m}m} g }|D ]}	|dk    r|j        n|dk    r|j        n|j        }
|                    |	j        |pd|pd|
|	j        	                    d          	          }|D ]=}|
                    t          ||	j                                        
                     >t                              dt          |           dt          |           d           |S || j        vr,t                              d| d| j                    | j        }| j        |         }|d         }|d                                         }|||d<   |||d<   |                    |            |di |}|                    |          }t                              dt          |           dt          |           d           |S # t*          $ r#}t                              d|             d }~ww xY w)N)tax_article
tax_clausetax_chapterr   )SplitterTypeTextSplitterServicerJ   rK   r   r   title)r@   r   r   splitter_typedocument_titlerC   z"Tax document splitting completed: z	 docs -> z chunkszUnknown chunking strategy: z, using default strategy: r   r   r   r   zDocument chunking completed: z documents -> zDocument chunking failed:  ),app.services.knowledge.text_splitter_servicerM   rN   TAX_ARTICLE
TAX_CLAUSETAX_CHAPTER
split_textrD   rA   getappendrE   copyr4   r5   r6   SPLITTER_CONFIGSwarningr$   updatesplit_documentsr7   r8   )r'   r=   rH   r   r   kwargsrM   rN   result_chunksrF   rP   chunks
chunk_textconfigsplitter_classr   splitterr>   s                     r(   r^   z!DocumentProcessor.split_documentsP   s   7	GGG        !#$  C $}44 %00  (<77 )33!-!9 " 1;; -#-#5&3&:s&3'*|'7'7'@'@ <  F '-  
%,,-:PSP\PaPaPcPcddd    mYmmRUVcRdRdmmm   %$t444m(mmVZVkmm    0*84F#G_N#$45::<<N%/9|,(2?/!!&)))%~7777H--i88FKKbIbbcRXkkbbb   M 	 	 	LL9a99:::	s   C+G .C G 
G<G77G<c                 r   	 |                      |          }|r|D ]}|j                            |            | j        |f|||d|}	g }
t	          |	          D ](\  }}|
                    |j        |j        |d           )|
S # t          $ r&}t          	                    d| d|             d }~ww xY w)NrH   r   r   r@   rA   chunk_indexzFailed to process file r.   )
r?   rA   r]   r^   	enumeraterY   rD   r7   r4   r8   )r'   r+   rH   r   r   rA   r_   r=   rF   ra   resultidxchunkr>   s                 r(   process_filezDocumentProcessor.process_file   s   	**955I 2$ 2 2CL''1111)T)!%+	 
  F F'//  
U"/U^\_``    M 	 	 	LLC9CCCCDDD	s   BB 
B6!B11B6c                 ,   	 |                      ||          } | j        |f|||d|}g }	t          |          D ](\  }
}|	                    |j        |j        |
d           )|	S # t          $ r#}t                              d|             d }~ww xY w)Nrg   rh   zFailed to process text: )	rG   r^   rj   rY   rD   rA   r7   r4   r8   )r'   r@   rH   r   r   rA   r_   r=   ra   rk   rl   rm   r>   s                r(   process_textzDocumentProcessor.process_text   s    	tX66I)T)!%+	 
  F F'//  
U"/U^\_``    M 	 	 	LL7A77888	s   A#A& &
B0BBc           	      f   	  | j         |f|||d|}d |D             }t          |          |rt          |          t          |          z  nd|rt          |          nd|rt	          |          nd|d d         |||dS # t
          $ r#}t                              d|             d }~ww xY w)Nrg   c                 8    g | ]}t          |d                    S )r@   )r6   ).0rm   s     r(   
<listcomp>z6DocumentProcessor.preview_chunking.<locals>.<listcomp>   s$    DDDESv//DDDr*   r      )total_chunksavg_chunk_lengthmin_chunk_lengthmax_chunk_lengthra   rH   r   r   zFailed to preview chunking: )rp   r6   summinmaxr7   r4   r8   )	r'   r@   rH   r   r   r_   ra   chunk_lengthsr>   s	            r(   preview_chunkingz"DocumentProcessor.preview_chunking   s	   	&T&!%+	 
  F EDVDDDM #FOU$\C$6$6]9K9K$K$K[\:@$GC$6$6$6a:@$GC$6$6$6a !*$(!.	 	 	  	 	 	LL;;;<<<	s   B B 
B0B++B0N)r    NN)r    NNN)r    r   r   )__name__
__module____qualname__r   r   r	   r   r   r1   r   r   r   r[   r)   strlistrE   r?   dictrG   intr^   r   rn   rp   r~   rR   r*   r(   r   r      s       /.)'& N 4"!$YYY 
 
 +-1CVZ[[
 

 '-02FF
 
 &) ) )
s t4E/F    " c TD[ DIZD[     $!%$(? ?)*? ? $J	?
 Tz? 
	 ? ? ? ?H $!%$( $   $J	
 Tz + 
d38n	   D $!%$( $   $J	
 Tz + 
d38n	   > $    	
  
c3h     r*   r   Nr,   c                  :    t           t                      a t           S r   )_document_processorr   rR   r*   r(   get_document_processorr      s    "/11r*   )pathlibr   typingr   $langchain_community.document_loadersr   r   r   r   r	   langchain_core.documentsr
   rE   langchain_text_splittersr   r   r   common_loggingr   r   r4   r   r   r   rR   r*   r(   <module>r      s8                            C B B B B B          & % % % % %	H		P P P P P P P Pf   1      r*   