
    Xj=                        d dl mZ d dlmZmZmZ d dlmZmZm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ  ee          Z G d
 d          ZdedefdZedk    rd dlmZ  e            Z	  ee          Ze                    dddddd          Z e!                    de             e"                    g dddddd          Z#e!                    de#            e$                    ddd          Z%e!                    de%            e&                                 dS # e&                                 w xY wdS )    )Session)DocumentNotFoundErrorEmbeddingGenerationErrorKnowledgeBaseNotFoundError)DocumentVectorKnowledgeBaseKnowledgeDocument)SplitterTypeTextSplitterService)get_task_manager)get_embedding_factory)get_document_processor)get_vector_store)
get_loggerc                       e Zd ZdefdZ	 	 	 	 	 	 	 	 dd	ed
edededededz  dedz  dedz  dedefdZ		 	 	 	 	 	 	 dde
d
ededededz  dedz  dedz  dedefdZ	 	 	 	 ddededz  dedz  dedz  dedefdZdS )DocumentVectorizationServicedbc                     || _         t                      | _        t                      | _        t                      | _        t                      | _        d S N)	r   r   doc_processorr   text_splitter_servicer   embedding_factoryr   task_manager)selfr   s     T/lsinfo/ai/hellotax_ai/base_platform/app/services/knowledge/vectorization_service.py__init__z%DocumentVectorizationService.__init__   sG    355%8%:%:"!6!8!8,..    	recursive        NTdocument_idchunk_strategy
chunk_sizechunk_overlapwindow_sizemodel_id	tenant_iduser_idcleanup_modelreturnc
                    |t          d          	 | j                            t                                        t          j        |k                                              }
|
st          |          |=|
j        5t          
                    d| d|
j         d           t          |          nC|
j        |k    r8t          
                    d| d|
j         d| d           t          |          t                              d|
j         d| d	           |d
k    rU| j                            |
j        ||t           j        ||
j        |
j        |          }d t'          |          D             }nf|
j        r3| j                            |
j        |||||
j        |
j        d          }n,| j                            |
j        |||||
j        d          }t                              dt3          |           d           | j                            |t3          |                    }|                                 g }d |D             }| j                            || j        |          }t'          |          D ]L\  }}|r4|                    |           | j                             ||dz              ;tC          d|           t                              dt3          |           d           |rZddl"m#} | j                            |                              |j        |k                                              }|r|j$        nd}nd}d }|
j%        ryddl&m'} | j                            |                              |j        |
j%        k                                              }|r$|j(        }t                              d|            tS          | j        ||          }|*                    |||||
j+        pd gt3          |          z  |
j,        r't[          |
j,        .                    d!                    ndgt3          |          z  "          }t                              d#t3          |           d           t'          |          D ]$\  }}t_          ||0                    d$|          |d%         |t3          |          k     rtc          ||                   nd ||0                    d&          |0                    d'd(          |0                    d)          |
j2        |
j        |
j3        |0                    d*          |0                    d+          |
j+        pd |
j,        r't[          |
j,        .                    d!                    nd,          }| j        4                    |           &| j        5                                 t                              d-t3          |           d.           |d
k    r|r	 dd/l6m7} dd0l8m9} |j:        r |            }|;                    |||pd|1          } | 0                    d2          rHt                              d3| 0                    d4           d5| 0                    d6           d7           nKt          
                    d8| 0                    d9                      nt                              d:           n4# tx          $ r'}!t          
                    d;|!            Y d }!~!nd }!~!ww xY wd<|
_=        ||
_>        |d
k    |
_?        |d
k    r|nd |
_@        | j        A                                 | j        B                    |           |	ri	 | j        C                                 t                              d=           n4# tx          $ r'}!t          
                    d>|!            Y d }!~!nd }!~!ww xY wd<|t3          |          t3          |          ||||d?S # tx          $ rK}!t          D                    d@|!            | j        E                    |tc          |!                      d }!~!ww xY w)ANz%user_id is required for vectorizationzTenant mismatch: document  belongs to tenant z:, but platform-level access (tenant_id=None) was requested, but tenant  was requestedz!Starting document vectorization: z (ID: )tax_adaptive)textr$   r%   splitter_typer"   document_titledocument_numberr&   c                 H   g | ]\  }}|d          |                     di           |                     d          |                     d|          |                     dd          |                     d          |                     d          |                     dg           d	S )
r2   metadatachunk_idchunk_index	is_parentFparent_chunk_idchunk_level
references)r2   r7   r8   r9   r:   r;   r<   r=   )get).0ichunks      r   
<listcomp>zCDocumentVectorizationService.vectorize_document.<locals>.<listcomp>K   s        !5 !&f$)IIj"$=$=$)IIj$9$9',yy'B'B%*YY{E%B%B+0995F+G+G',yy'?'?&+iib&A&A	 	  r   )r"   titlesource)	file_pathstrategyr$   r%   r7   )r"   rC   )r2   rF   r$   r%   r7   zDocument chunking completed: z chunksc                     g | ]
}|d          S )r2    )r?   rA   s     r   rB   zCDocumentVectorizationService.vectorize_document.<locals>.<listcomp>p   s    ===U5====r   )textsr   r'      z'Failed to generate embedding for chunk z Embedding generation completed: z vectorsr   )ModelunknowndefaultKnowledgeCategoryz'Document belongs to knowledge base ID: )knowledge_base_idr(   	effectivez%Y%m%d)r"   chunks
embeddings
model_namedoc_status_listissue_date_int_listzVector storage completed: r9   r2   r;   r:   Fr<   r=   r8   )r"   r9   
chunk_text	milvus_idrT   r;   r:   r<   doc_type
doc_numberissuing_authorityr=   r8   
doc_statusissue_date_intz'PostgreSQL metadata storage completed: z records)settings)get_graph_builder)r"   rR   r(   kb_idsuccesszChunk graph built: chunk_countz	 chunks, reference_countz referenceszChunk graph build failed: errorz4Knowledge graph disabled, skipping chunk graph buildz)Failed to build chunk graph (non-fatal): Tz2Local embedding model unloaded after vectorizationz"Failed to unload embedding model: )ra   r"   chunks_countvectors_countrT   rF   r$   r%   zDocument vectorization failed: )F
ValueErrorr   queryr	   filteridfirstr   r(   loggerwarninginforC   r   create_chunks_with_metadatacontentr
   TAX_ADAPTIVErZ   	enumeraterE   r   process_filerD   process_textlenr   create_taskstartr   generate_embeddings_batchappendupdate_taskr   app.models.providerrK   namecategory_idapp.models.knowledge_baserO   rP   r   add_documentsr\   
issue_dateintstrftimer   r>   strrY   r[   addflush
app.configr^    app.services.graph.graph_builderr_   ENABLE_KNOWLEDGE_GRAPHbuild_chunk_graph	Exceptionis_vectorizedvector_modelenable_parent_childr&   commitcomplete_taskunload_local_embedding_modelrd   	fail_task)"r   r"   r#   r$   r%   r&   r'   r(   r)   r*   documentchunks_with_metadatarR   taskrS   chunk_textsbatch_embeddingsr@   	embeddingrK   modelrT   rP   rO   categoryvector_store
vector_idsrA   
doc_vectorr^   r_   graph_buildergraph_resultes"                                     r   vectorize_documentz/DocumentVectorizationService.vectorize_document    s	    ?DEEEv	/00778I8LP[8[\\bbdd   9+K888 %1NN d[  d  dU]Ug  d  d  d   0<<<	 2
 #y00 L  L  LQYQc  L  Lr{  L  L  L   ,K888KK`HN``R]```aaa//'+'A']']!))"/"."; +#+>$,$7 + (^ 	( 	($  %..B$C$C   # +88&0+)"/'2!)"*/  9 
 
 +88!)+)"/-88>RR 9   KKLFLLLMMM$00c&kkJJDJJLLLJ==f===K#5OO!dg  P     !**: ; ; b b9 b%%i000%11+q1uEEEE23`]^3`3`aaaKKT3z??TTTUUU '555555e,,33EH4HIIOOQQ+0?UZZi

&
 $# 
_GGGGGG GMM"344V-0H4HHIIUWW 
  _(0(B%KK ]J[ ] ]^^^++<	  L &33'%%!)!4!C Ds6{{ RCKCV]C+44X>>???\]% f++% 4 
 
J KKNS__NNNOOO%f-- ( (5+ + %		- ; ;$V}45J4G4Gc*Q-000T)$)II.?$@$@#iiU;; %		- 8 8%.'2&.&@$yy66"YYz22'2AkGOGZaH/88BBCCC`a!  
& J''''GMMOOOKKW#f++WWWXXX//4E/T333333RRRRRR6 \(9(9(;(;'4'F'F(3#)&/n1"3	 (G ( ( (++I66 e"KK !Ql6F6F}6U6U  !Q  !Q`l`p`p  rC  aD  aD  !Q  !Q  !Q    #NN+cHXHXY`HaHa+c+cdddd$Z[[[  T T TNN#Rq#R#RSSSSSSSST%)H"$.H!+9^+KH(2@N2R2R;;X\H GNN++K888 MM*GGIIIKK TUUUU  M M MNN#K#K#KLLLLLLLLM  * #F!$Z(*(!.	 	 	  	 	 	LL>1>>???''SVV<<<	so   X` C [6 5` 6
\' \"` "\''A ` 3^< ;` <
_-_(#` (_--'` 
a*Aa%%a*document_idsc	                 *   |t          d          g g t          |          d}	|D ]}
	 |                     |
||||||d          }|	d                             |
|d           ?# t          $ r5}|	d                             |
t          |          d           Y d }~yd }~ww xY w|ri	 | j                                         t          	                    d	           n4# t          $ r'}t          
                    d
|            Y d }~nd }~ww xY w|	S )Nz+user_id is required for batch vectorization)ra   failedtotalF)r"   r#   r$   r%   r'   r(   r)   r*   ra   )r"   resultr   )r"   rd   z8Local embedding model unloaded after batch vectorizationz<Failed to unload embedding model after batch vectorization: )rg   ru   r   ry   r   r   r   r   rl   rn   rm   )r   r   r#   r$   r%   r'   r(   r)   r*   resultsdoc_idr   r   s                r   batch_vectorize_documentsz6DocumentVectorizationService.batch_vectorize_documents   s    ?JKKK B\9J9JKK" 	S 	SFS00 &#1)"/%'#"' 1 	 	 	"))&F*S*STTTT S S S!((#a&&)Q)QRRRRRRRRS 	cc&CCEEEVWWWW c c ca^_aabbbbbbbbcs/   ;A&&
B%0+B  B%+3C 
D)DDrP   c           	      ~   |t          d          	 | j                            t                                        t          j        |k                                              }|st          |          |=|j        5t          
                    d| d|j         d           t          |          nC|j        |k    r8t          
                    d| d|j         d| d           t          |          ddlm} d	 | j                            |                              |j        |k                                              D             }|ra| j                            t                                        t          j                            |                                                    ng }	d
 |	D             }
t                              d|j         dt)          |
                      |                     |
||||          }|S # t,          $ r#}t                              d|             d }~ww xY w)Nz7user_id is required for knowledge base re-vectorizationz Tenant mismatch: knowledge base r-   z), but platform-level access was requestedr.   r/   r   rN   c                     g | ]	}|j         
S rH   rj   )r?   cats     r   rB   zLDocumentVectorizationService.re_vectorize_knowledge_base.<locals>.<listcomp>&  s*          r   c                     g | ]	}|j         
S rH   r   )r?   docs     r   rB   zLDocumentVectorizationService.re_vectorize_knowledge_base.<locals>.<listcomp>3  s    888sCF888r   z*Starting knowledge base re-vectorization: z, document count: )r   r'   r(   r)   r*   z(Knowledge base re-vectorization failed: )rg   r   rh   r   ri   rj   rk   r   r(   rl   rm   
app.modelsrO   rP   allr	   r}   in_rn   r|   ru   r   r   rd   )r   rP   r'   r(   r)   r*   kbrO   category_ids	documentsr   r   r   s                r   re_vectorize_knowledge_basez8DocumentVectorizationService.re_vectorize_knowledge_base  s    ?VWWW,	}--44]5EIZ5Z[[aaccB D01BCCC <+NN Y;L  Y  Yacam  Y  Y  Y   55FGGG	 ,
 ** R7H  R  R]_]i  R  R  yB  R  R  R   11BCCC444444 7==):;;);?PPQQ	  L  /00)599,GGHH  98i888LKKkRWkkX[\hXiXikk   44)!#+ 5  G N 	 	 	LLGAGGHHH	s   G;H 
H<H77H<)r   r   r    r!   NNNT)r   r   r    NNNT)NNNT)__name__
__module____qualname__r   r   r   r   booldictr   listr   r   rH   r   r   r   r      s       /7 / / / / * # $""D DD D 	D
 D D *D :D tD D 
D D D DR * # $""# ## # 	#
 # *# :# t# # 
# # # #P  $ $""6 66 *6 :	6
 t6 6 
6 6 6 6 6 6r   r   r   r+   c                      t          |           S r   )r   )r   s    r   get_vectorization_servicer   D  s    '+++r   __main__)SessionLocal{   r   r   r    rJ   )r"   r#   r$   r%   r'   r)   zVectorization result: )r   |   }   )r   r#   r$   r%   r'   r)   zBatch processing result:    )rP   r'   r)   z(Knowledge base re-vectorization result: N)'sqlalchemy.ormr   app.core.exceptionsr   r   r   r   r   r   r	   ,app.services.knowledge.text_splitter_servicer
   r   1app.services.knowledge.vectorization_task_managerr   3app.services.llm.backends.embedding_backend_factoryr   -app.services.rag.langchain_document_processorr   )app.services.storage.vector_store_factoryr   common_loggingr   r   rl   r   r   app.db.sessionr   r   vectorization_servicer   r   rn   r   batch_resultr   	kb_resultcloserH   r   r   <module>r      s   " " " " " "          H G G G G G G G G G Z Z Z Z Z Z Z Z N N N N N N U U U U U U P P P P P P F F F F F F % % % % % %	H		j j j j j j j jZ	,' ,.J , , , , z++++++ 
B < <R @ @&99& : 
 
 	5V55666,FF(& G 
 
 	>>>???)EE!Q F 
 
	 	JyJJKKK









= s   8B#D1 1E