
    Xj                       |    d dl mZ d dlmZ  ee          Z G d dee          Z G d d          Zda	defd	Z
dS )
    )Enum)
get_loggerc                   "    e Zd ZdZdZdZdZdZdS )SplitterType	recursive	charactertokenmarkdowntax_adaptiveN)__name__
__module____qualname__	RECURSIVE	CHARACTERTOKENMARKDOWNTAX_ADAPTIVE     T/lsinfo/ai/hellotax_ai/base_platform/app/services/knowledge/text_splitter_service.pyr   r   
   s'        IIEH!LLLr   r   c                   @   e Zd Zeddej        ddddfdedededed	ed
edz  dedz  dedz  fd            Zedededede	e         fd            Z
edededed	ede	e         f
d            Zedededede	e         fd            Zedededede	e         fd            Zedededed
edz  de	e         f
d            Zededededed
edz  de	e         fd            Zeddej        ddddfdedededededz  d
edz  dedz  dedz  de	e         fd            ZdS )TextSplitterServicei     

Ntext
chunk_sizechunk_overlapsplitter_type	separatordocument_titledocument_iddocument_numberc                    	 |t           j        k    rt                              | ||          S |t           j        k    rt                              | |||          S |t           j        k    rt                              | ||          S |t           j        k    rt          	                    | ||          S |t           j
        k    r!t                              | |pd|pd|          S t                              d| d           t                              | ||          S # t          $ r*}t                              d|            | gcY d }~S d }~ww xY w)Nunknown zUnknown splitter type: z", using default recursive splitterzText splitting failed: )r   r   r   _split_recursiver   _split_characterr   _split_tokenr   _split_markdownr   _split_tax_adaptiveloggerwarning	Exceptionerror)	r   r   r   r   r   r    r!   r"   es	            r   
split_textzTextSplitterService.split_text   sn   	 666*;;D*m\\\,"888*;;*mY   ,"444*77j-XXX,"777*::4][[[,";;;*>>+2O4Ir>   _m___   +;;D*m\\\ 	 	 	LL61667776MMMMMM	s:   +D ,D +D +D 30D $9D 
E(EEEreturnc                     ddl m}  |||t          g d          }|                    |           }t                              dt          |            dt          |           d           |S )Nr   )RecursiveCharacterTextSplitter)
r   
u   。u   ！u   ？.!? r%   )r   r   length_function
separatorszRecursive splitting completed:  characters ->  chunks)langchain_text_splittersr3   lenr0   r+   info)r   r   r   r3   splitterchunkss         r   r&   z$TextSplitterService._split_recursive7   s    KKKKKK11!'RRR	
 
 
 $$T**\c$ii\\F\\\	
 	
 	
 r   c                     ddl m}  ||||t                    }|                    |           }t                              dt          |            dt          |           d           |S )Nr   )CharacterTextSplitter)r   r   r   r9   zCharacter splitting completed: r;   r<   )r=   rC   r>   r0   r+   r?   )r   r   r   r   rC   r@   rA   s          r   r'   z$TextSplitterService._split_characterG   s     	CBBBBB((!'	
 
 
 $$T**\c$ii\\F\\\	
 	
 	
 r   c                     ddl m}  |||          }|                    |           }t                              dt          |            dt          |           d           |S )Nr   )TokenTextSplitterr   r   zToken splitting completed: r;   r<   )r=   rE   r0   r+   r?   r>   )r   r   r   rE   r@   rA   s         r   r(   z TextSplitterService._split_tokenY   st    >>>>>>$$
-XXX$$T**`#d))``CPVKK```aaar   c                     ddl m}  |||          }|                    |           }t                              dt          |            dt          |           d           |S )Nr   )MarkdownTextSplitterrF   zMarkdown splitting completed: r;   r<   )r=   rH   r0   r+   r?   r>   )r   r   r   rH   r@   rA   s         r   r)   z#TextSplitterService._split_markdownb   st    AAAAAA'':][[[$$T**cSYYccsSY{{cccdddr   c                    ddl m}  |            }d}|rd|v sd|v rd}n
d|v sd|v rd	}|                    | |||
          }g }|D ]H}	|                    |	j        |	j        |	j        |	j        |	j        |	j	        |	j
        |	j        d           It                              dt          |            dt          |           dt          d |D                        dt          d |D                        d	           |S )Nr   )TaxAdaptiveSplitterlawu   公告u   通知announcementu   案例u   判决case)r   r!   r"   doc_type)r   chunk_id	is_parentparent_chunk_idchunk_levelchunk_index
referencesmetadataz"Tax adaptive splitting completed: r;   z	 chunks (c              3   *   K   | ]}|d          
dV  dS rP      Nr   .0cs     r   	<genexpr>z:TextSplitterService._split_tax_adaptive.<locals>.<genexpr>   s\        eJ  eJklz{  }H  {I  eJef  eJ  eJ  eJ  eJ  eJ  eJr   z parents + c              3   *   K   | ]}|d          
dV  dS rW   r   rY   s     r   r\   z:TextSplitterService._split_tax_adaptive.<locals>.<genexpr>   sh        ZC  ZC  ab  tu  vA  tB  ZC  [\  ZC  ZC  ZC  ZC  ZC  ZCr   z
 children)),app.services.knowledge.tax_adaptive_splitterrJ   splitappendr   rO   rP   rQ   rR   rS   rT   rU   r+   r?   r>   sum)
r   r!   r"   r    rJ   r@   rN   
tax_chunksrA   chunks
             r   r*   z'TextSplitterService._split_tax_adaptivek   s    	UTTTTT&&(( 	">))X-G-G)^++x>/I/I!^^;Zb $ 
 

  	 	EMM!J %!&','<#(#4#(#4"'"2 %	 	    	 OT  O  O3v;;  O  Oad  eJ  eJpv  eJ  eJ  eJ  bJ  bJ  O  O  WZ  ZC  ZC  fl  ZC  ZC  ZC  WC  WC  O  O  O	
 	
 	
 r   granularityc           	          ddl m}  ||d|||          }|                    |           }t                              dt          |            dt          |           d| d           |S )	Nr   )TaxDocumentSplitterT)rd   add_contextr    r   r   z"Tax document splitting completed: r;   z chunks (granularity: )),app.services.knowledge.tax_document_splitterrf   r0   r+   r?   r>   )r   rd   r   r   r    rf   r@   rA   s           r   
_split_taxzTextSplitterService._split_tax   s     	UTTTTT '&#)!'
 
 
 $$T**|T||3v;;||ny|||	
 	
 	
 r   window_sizec                    |t           j        k    r4t                              | |||||rt	          |          nd |          }|S t                              | |||||rt	          |          nd |          }g }	t          |          D ]N\  }
}||
t          |          t          |          |||
t          |          dd}|	                    |           O|	S )N)r   r   r   r   r    r!   r"   )r!   r    rS   total_chunks)r   rS   rm   r   rU   )r   r   r   r0   str	enumerater>   r`   )r   r   r   r   r!   r    r"   rk   rA   resultirc   
chunk_datas                r   create_chunks_with_metadataz/TextSplitterService.create_chunks_with_metadata   s    L555(33%++-0;EC,,, / 4  F M$//!''),7AK(((T+ 0 
 
 !&)) 	& 	&HAu  #F!%jj#.&4#$$'KK	  J MM*%%%%r   )r   r   r   staticmethodr   r   rn   intr0   listr&   r'   r(   r)   dictr*   rj   rs   r   r   r   r   r      s         &2&<%)"&&*        $	 
   d
  4Z  t      \ D s  C DQTI    \ "36CF	c   \" 3 C  S	    \ c s 3 4PS9    \    # 69 KNQU: 	d      \ D   	
 d
 
c   \.   &2&<"&%)&*"&- --- - $	-
 4Z- d
- t- 4Z- 
d- - - \- - -r   r   Nr1   c                  :    t           t                      a t           S )N)_text_splitter_servicer   r   r   r   get_text_splitter_servicerz      s    %!4!6!6!!r   )enumr   common_loggingr   r   r+   rn   r   r   ry   rz   r   r   r   <module>r}      s          % % % % % %	H		
" " " " "3 " " "B B B B B B B BJ  "#6 " " " " " "r   