
    j                         d dl Z d dlmZ d dlmZ ddlmZ ddlmZ ddl	m
Z
  e j        e          Ze G d d	                      Z G d
 d          ZdS )    N)	dataclass)Any   )ParentChildSplitStrategy)AdaptivePropositionSplitter)TaxReferenceExtractorc                       e Zd ZU eed<   eed<   eed<   edz  ed<   eed<   eed<   eeee	f                  ed<   eee	f         ed	<   dS )
TaxChunkchunk_idtext	is_parentNparent_chunk_idchunk_levelchunk_index
referencesmetadata)
__name__
__module____qualname__str__annotations__boolintlistdictr        T/lsinfo/ai/hellotax_ai/base_platform/app/services/knowledge/tax_adaptive_splitter.pyr
   r
      s         MMM
IIIOOO4ZT#s(^$$$$38nr   r
   c                       e Zd Z	 	 	 	 ddedededefd	Z	 ddededededee         f
dZdedededee	ee
f                  fdZdeee	ee
f                           dee	ee
f                  fdZdS )TaxAdaptiveSplitter          window_sizemin_child_sizemax_child_sizetarget_parent_sizec                     || _         || _        || _        || _        t	          ||||z   dz            | _        t                      | _        t                      | _	        d S )N   )min_sizemax_sizetarget_size)
r%   r&   r'   r(   r   proposition_splitterr   parent_child_strategyr   reference_extractor)selfr%   r&   r'   r(   s        r   __init__zTaxAdaptiveSplitter.__init__   sr     ',,"4$?##'.8Q>%
 %
 %
!
 &>%?%?"#8#:#:   r    lawr   document_iddocument_numberdoc_typereturnc                 b   |r|                                 st                              d|            g S t                              d| d|            | j                            ||          }|st                              d|            g S t                              dt          |           d           | j                            |||          }t                              d	t          |           d
           | j	        
                    || j                  \  }}t                              dt          |           dt          |           d           i |D ]*}	|                     |	j        ||          }
|
r
|
|	j        <   +g }t          |          D ]\  }}|                     fd|j        D                       }t%          |j        |j        dd d|||j        |j        |j        |j        t          |j                  d|j                  }|                    |           t          |          }t          |          D ]\  }}	                    |	j        g           }
t%          |	j        |	j        d|	j        |	j        ||z   |
|	j        |	j        t          |	j                  d|	j                  }|                    |           t                              dt          |           dt          |           dt          |           d           |S )NzEmpty text for document zSplitting document z with type )r7   z!No chunks generated for document z
Generated z proposition chunksr   doc_id
doc_numberz
Extracted z document-level references)chunksr%   zCreated z parent chunks and z child chunksc                 <    g | ]}                     |g           S r   )get).0cidchild_references_maps     r   
<listcomp>z-TaxAdaptiveSplitter.split.<locals>.<listcomp>I   s*    OOOs%))#r22OOOr   Tparent)r5   article_rangechapter	child_ids
char_count)r   r   r   r   r   r   r   r   F)r5   levelrH   z total TaxChunks (z parents + z
 children))striploggerwarninginfor.   splitlenr0   extract_relationsr/   create_parent_child_pairsr%   _extract_chunk_referencesr   child_id	enumerate_merge_referencesrG   r
   	parent_idr5   rE   rF   r   appendr?   rI   )r1   r   r5   r6   r7   proposition_chunksdoc_referencesparentschildrenchildrefs
tax_chunksidxrD   parent_refs	tax_chunkparent_countrB   s                    @r   rN   zTaxAdaptiveSplitter.split-   s     	4::<< 	NNCkCCDDDIL+LL(LLMMM!6<<TH<UU! 	NNL{LLMMMIM%7!8!8MMMNNN1CCko D 
 
 	P^!4!4PPPQQQ 6PP%43C Q 
 
 	\s7||\\H\\\]]]! 	< 	<E11%*k?[[D <7;$U^4
$W-- 	) 	)KC00OOOOf>NOOO K !)[ $$&#)#5%+%9%~!'!1"%fk"2"2  o  I" i((((7||#H-- 	) 	)JC'++ENB??D Z %!K(3.#(#4"["%ej//  n	  I i((((nZnnCLLnnUXYaUbUbnnn	
 	
 	
 r   
chunk_textc                 T    | j                             |||          }d |D             S )Nr:   c                 Z    g | ](}|j         j        |j        |j        |j        |j        d )S ))relation_typetarget_doc_numberarticle_numbercontext
confidence)rf   valuerg   rh   ri   rj   )r@   rels     r   rC   zATaxAdaptiveSplitter._extract_chunk_references.<locals>.<listcomp>{   sP     	
 	
 	
  "%!2!8%(%:"%"4;!n 	
 	
 	
r   )r0   rP   )r1   rc   r5   r6   	relationss        r   rR   z-TaxAdaptiveSplitter._extract_chunk_referencesu   sK     ,>>KO ? 
 
		
 	
 !	
 	
 	
 		
r   reference_listsc                 ^   i }|D ]}|D ]}|                     d          |                     d          |                     d          f}||v r:|                     dd          ||                              dd          k    r|||<   ~|||<   t          |                                          S )Nrf   rg   rh   rj   r   )r?   r   values)r1   rn   
merged_mapref_listrefkeys         r   rU   z%TaxAdaptiveSplitter._merge_references   s     35
' 	* 	*H 
* 
*GGO,,GG/00GG,--
 *$$ww|Q//*S/2E2ElTU2V2VVV*-
3&)JsOO
* J%%''(((r   N)r!   r"   r#   r$   )r3   r4   )r   r   r   r   r2   r   r   r
   rN   r   r   rR   rU   r   r   r   r    r       s<        !!"&; ;; ; 	;
  ; ; ; ;( W\F FF&)F<?FPSF	hF F F FP

,/
BE
	d38n	
 
 
 
")#Dc3h$89)	d38n	) ) ) ) ) )r   r    )loggingdataclassesr   typingr   parent_child_storer   r.   r   r0   r   	getLoggerr   rK   r
   r    r   r   r   <module>rz      s     ! ! ! ! ! !       8 8 8 8 8 8 = = = = = = 6 6 6 6 6 6		8	$	$        ~) ~) ~) ~) ~) ~) ~) ~) ~) ~)r   