o
    Ui@                     @   sb  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d d	lmZmZmZ d d
lmZmZmZ eeZG dd dZdedefddZedkrd dlmZ e ZzCeeZ e j!dddddddZ"e#de"  e j$g dddddddZ%e#de%  e j&ddddZ'e#de'  W e(  dS e(  w dS )     )Session)OptionalN)get_document_processor)get_embedding_factory)get_vector_store)get_task_manager)TextSplitterServiceSplitterType)KnowledgeDocumentKnowledgeBaseDocumentVector)DocumentNotFoundErrorKnowledgeBaseNotFoundErrorEmbeddingGenerationErrorc                   @   s   e Zd ZdefddZ									dd
edededededee dee dee dede	fddZ
								ddededededee dee dee dede	fddZ					ddedee dee dee dede	fddZdS )DocumentVectorizationServicedbc                 C   s*   || _ t | _t | _t | _t | _d S N)	r   r   doc_processorr   text_splitter_servicer   embedding_factoryr   task_manager)selfr    r   T/lsinfo/ai/hellotax_ai/base_platform/app/services/knowledge/vectorization_service.py__init__   s
   z%DocumentVectorizationService.__init__	recursive        NTdocument_idchunk_strategy
chunk_sizechunk_overlapwindow_sizemodel_id	tenant_iduser_idcleanup_modelreturnc
           "      C   s  |d u rt dz| jttj|k }
|
st||d u r9|
jd ur8t	
d| d|
j d t|n|
j|krRt	
d| d|
j d| d t|t	d|
j d| d	 |d
kr| jj|
j||tj||
j|
j|d}dd t|D }n&|
jr| jj|
j|||||
j|
jdd}n| jj|
j|||||
jdd}t	dt| d | j|t|}|  g }dd |D }| jj|| j|d}t|D ]\}}|r|| | j ||d  qt!d| t	dt| d |rddl"m#} | j||j|k }|r|j$nd}nd}d }|
j%rFddl&m'} | j||j|
j%k }|rF|j(}t	d|  t)| j||d }|j*|||||
j+pYd!gt| |
j,rjt-|
j,.d"ndgt| d#}t	d$t| d t|D ]T\}}t/||0d%||d& |t|k rt1|| nd ||0d'|0d(d)|0d*|
j2|
j|
j3|0d+|0d,|
j+pd!|
j,rt-|
j,.d"ndd-}| j4| q| j5  t	d.t| d/ |d
krV|rVzKdd0l6m7} dd1l8m9} |j:r4| }|j;|||pd|d2} | 0d3r(t	d4| 0d5 d6| 0d7 d8 nt	
d9| 0d:  nt	d; W n t<yU }! zt	
d<|!  W Y d }!~!nd }!~!ww d=|
_=||
_>|d
k|
_?|d
krh|nd |
_@| jA  | jB| |	rz| jC  t	d> W n t<y }! zt	
d?|!  W Y d }!~!nd }!~!ww d=|t|t|||||d@W S  t<y }! zt	DdA|!  | jE|t1|!  d }!~!ww )BNz%user_id is required for vectorizationzTenant mismatch: document  belongs to tenant z:, but platform-level access (tenant_id=None) was requested, but tenant  was requestedz!Starting document vectorization: z (ID: )tax_adaptive)textr!   r"   splitter_typer   document_titledocument_numberr#   c                 S   s\   g | ]*\}}|d  | di | d| d|| dd| d| d| dg d	qS )
r.   metadatachunk_idchunk_index	is_parentFparent_chunk_idchunk_level
references)r.   r2   r3   r4   r5   r6   r7   r8   )get).0ichunkr   r   r   
<listcomp>X   s    



zCDocumentVectorizationService.vectorize_document.<locals>.<listcomp>)r   titlesource)	file_pathstrategyr!   r"   r2   )r   r>   )r.   rA   r!   r"   r2   zDocument chunking completed: z chunksc                 S   s   g | ]}|d  qS )r.   r   )r:   r<   r   r   r   r=      s    )textsr   r$      z'Failed to generate embedding for chunk z Embedding generation completed: z vectorsr   )ModelunknowndefaultKnowledgeCategoryz'Document belongs to knowledge base ID: )knowledge_base_idr%   	effectivez%Y%m%d)r   chunks
embeddings
model_namedoc_status_listissue_date_int_listzVector storage completed: r4   r.   r6   r5   Fr7   r8   r3   )r   r4   
chunk_text	milvus_idrM   r6   r5   r7   doc_type
doc_numberissuing_authorityr8   r3   
doc_statusissue_date_intz'PostgreSQL metadata storage completed: z records)get_graph_builder)settings)r   rK   r%   kb_idsuccesszChunk graph built: chunk_countz	 chunks, reference_countz referenceszChunk graph build failed: errorz4Knowledge graph disabled, skipping chunk graph buildz)Failed to build chunk graph (non-fatal): Tz2Local embedding model unloaded after vectorizationz"Failed to unload embedding model: )rZ   r   chunks_countvectors_countrM   rA   r!   r"   zDocument vectorization failed: )F
ValueErrorr   queryr
   filteridfirstr   r%   loggerwarninginfor>   r   create_chunks_with_metadatacontentr	   TAX_ADAPTIVErS   	enumerater@   r   process_filer?   process_textlenr   create_taskstartr   generate_embeddings_batchappendupdate_taskr   app.models.providerrD   namecategory_idapp.models.knowledge_baserH   rI   r   add_documentsrU   
issue_dateintstrftimer   r9   strrR   rT   addflush app.services.graph.graph_builderrW   
app.configrX   ENABLE_KNOWLEDGE_GRAPHbuild_chunk_graph	Exceptionis_vectorizedvector_modelenable_parent_childr#   commitcomplete_taskunload_local_embedding_modelr]   	fail_task)"r   r   r    r!   r"   r#   r$   r%   r&   r'   documentchunks_with_metadatarK   taskrL   chunk_textsbatch_embeddingsr;   	embeddingrD   modelrM   rI   rH   categoryvector_store
vector_idsr<   
doc_vectorrW   rX   graph_buildergraph_resulter   r   r   vectorize_document   sn  











z/DocumentVectorizationService.vectorize_documentdocument_idsc	                 C   s   |d u rt dg g t|d}	|D ]9}
z| j|
||||||dd}|	d |
|d W q tyK } z|	d |
t|d W Y d }~qd }~ww |rwz| j  t	d	 W |	S  tyv } zt
d
|  W Y d }~|	S d }~ww |	S )Nz+user_id is required for batch vectorization)rZ   failedtotalF)r   r    r!   r"   r$   r%   r&   r'   rZ   )r   resultr   )r   r]   z8Local embedding model unloaded after batch vectorizationz<Failed to unload embedding model after batch vectorization: )r`   rn   r   rr   r   r|   r   r   re   rg   rf   )r   r   r    r!   r"   r$   r%   r&   r'   resultsdoc_idr   r   r   r   r   batch_vectorize_documents  sP   

z6DocumentVectorizationService.batch_vectorize_documentsrI   c              
   C   sh  |d u rt dz| jttj|k }|st||d u r8|jd ur7t	
d| d|j d t|n|j|krQt	
d| d|j d| d t|ddlm} d	d
 | j||j|k D }|r{| jttj| ng }	dd
 |	D }
t	d|j dt|
  | j|
||||d}|W S  ty } z	t	d|   d }~ww )Nz7user_id is required for knowledge base re-vectorizationz Tenant mismatch: knowledge base r)   z), but platform-level access was requestedr*   r+   r   rG   c                 S      g | ]}|j qS r   rc   )r:   catr   r   r   r=   t      zLDocumentVectorizationService.re_vectorize_knowledge_base.<locals>.<listcomp>c                 S   r   r   r   )r:   docr   r   r   r=   |  r   z*Starting knowledge base re-vectorization: z, document count: )r   r$   r%   r&   r'   z(Knowledge base re-vectorization failed: )r`   r   ra   r   rb   rc   rd   r   r%   re   rf   
app.modelsrH   rI   allr
   rv   in_rg   ru   rn   r   r   r]   )r   rI   r$   r%   r&   r'   kbrH   category_ids	documentsr   r   r   r   r   r   re_vectorize_knowledge_baseL  sf   



z8DocumentVectorizationService.re_vectorize_knowledge_base)r   r   r   r   NNNT)r   r   r   NNNT)NNNT)__name__
__module____qualname__r   r   rz   r|   r   booldictr   listr   r   r   r   r   r   r      s    
	

  	

8r   r   r(   c                 C   s   t | S r   )r   )r   r   r   r   get_vectorization_service  s   r   __main__)SessionLocal{   r   r   r   rC   )r   r    r!   r"   r$   r&   zVectorization result: )r   |   }   )r   r    r!   r"   r$   r&   zBatch processing result:    )rI   r$   r&   z(Knowledge base re-vectorization result: ))sqlalchemy.ormr   typingr   logging-app.services.rag.langchain_document_processorr   3app.services.llm.backends.embedding_backend_factoryr   )app.services.storage.vector_store_factoryr   1app.services.knowledge.vectorization_task_managerr   ,app.services.knowledge.text_splitter_servicer   r	   r   r
   r   r   app.core.exceptionsr   r   r   	getLoggerr   re   r   r   app.db.sessionr   r   vectorization_servicer   r   rg   r   batch_resultr   	kb_resultcloser   r   r   r   <module>   s\   
  