
    XjG                        d dl Z d dlZd dlmZmZmZmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z%  e%e&          Z'ddddddddddddZ( e            Z)e)*                    d           ed           ed           ed           ed           ed           eddd !           ed"d d!           ed#           ee           ee           e ed$d%                    fd&ed'e	d(e+dz  d)e+dz  d*e,d+e,d,e+d-e+d.e,d/ed0ed1dfd2            Z-e)*                    d3           ed           ed           ed           ed           ed           eddd !           ed"d d!           ed#           ee           ee           e ed$d%                    fd&ed4e.e	         d(e+dz  d)e+dz  d*e,d+e,d,e+d-e+d.e,d/ed0ed1dfd5            Z/dS )6    N)	APIRouterDependsFileFormHTTPExceptionRequest
UploadFile)Session)get_current_user)require_permission)settings)DocumentUploadErrorTextChunkingErrorUnsupportedFileTypeError)get_translator)get_db)DocumentVersionKnowledgeBaseKnowledgeDocumentModelUser)DocumentCreate)get_minio_service)
get_loggerzapplication/pdfzapplication/mswordzGapplication/vnd.openxmlformats-officedocument.wordprocessingml.documentzapplication/vnd.ms-powerpointzIapplication/vnd.openxmlformats-officedocument.presentationml.presentationz
text/plainztext/markdownzAapplication/vnd.openxmlformats-officedocument.spreadsheetml.sheetztext/csvzapplication/jsonzapplication/xml)pdfdocdocxpptpptxtxtmdxlsxcsvjsonxmlz/documents/upload.draft1i  d   i  )gele   	recursiveknowledge_basescreaterequestfilecategory_idknowledge_base_idstatusenable_chunking
chunk_sizechunk_overlapsplitter_typedbcurrent_user_c                 z  K   ddl m} ddlm}m} ddlm} t          |            |dvrt          dd	          d
}|j	        
                    dd           |j	                                        }|j	        
                    d           ||k    rt          dd	          g d}d|j        v r2|j                            d          d                                         nd}||vr*t          dd| dd                    |           	          |dv }t                               d|j        |           	 |                                 d {V }d|j        v r |j                            d          d         nd} |            }|                    ||          }|st)          |          d|j        v r3|j                            dd          d                                         nd}|t                               d           d }nd|
j         d| dt1          j                     d| }t5                      }t6                              |d          }|                    t<          j        |tA          j!        |          tE          |          |          }|s t                               d| d           d }d|j        v r!|j                            dd          d         n|j        }||tE          |          d k    r|d d          d!z   n|||||d"|rd#nd$|r|nd |r|nd |r|nd tE          |          d%}tG          d@i |} || ||	|
          } |r |r	  |            }!|j$        }"|                                d&k    r|j%        }"n|                                d'k    r|j&        }"n|                                d(k    r|j'        }"n|                                d)k    r|j(        }"n_|                                d*k    r|j)        }"n?|                                d+k    r|j*        }"n|                                d,k    r|j+        }"|!,                    ||||"| j-        | j.        -          }#| j/        0                                }$|#|$d.<   tE          |#          |$d/<   t                               d0| j-        |j        tE          |#          1           	 dd2l1m2}%  |%|	| j-                   n<# tf          $ r/}&t                               d3| j-         d4|&            Y d }&~&nd }&~&ww xY w	 dd5l4m5}'  |'            }(|(6                    |          })|)j7        j8        | _7        |)j9        | _9        |)j:        | _:        |	;                                 |)j7        j8        |$d6<   |)j9        |$d7<   |)j:        |$d8<   t                               d9| j-         d:|)j7        j8                    n<# tf          $ r/}&t                               d;| j-         d4|&            Y d }&~&nd }&~&ww xY w|$S # tx          $ r  tf          $ rB}&t           =                    d<|&            ty          d=t}          |&                     d d }&~&ww xY w	 dd2l1m2}%  |%|	| j-                   n<# tf          $ r/}&t                               d3| j-         d4|&            Y d }&~&nd }&~&ww xY w| S # t          $ r  tf          $ rW}&t           =                    d>|j        t}          |&          ?           t          |j        t}          |&                    d d }&~&ww xY w)ANr   get_file_parser_service)SplitterTypeget_text_splitter_service   )create_documentr,   	charactertokenmarkdowntax_article
tax_clausetax_chaptertax_adaptive  Invalid splitter_type. Must be one of: recursive, character, token, markdown, tax_article, tax_clause, tax_chapter, tax_adaptivestatus_codedetaili      z.File size exceeds maximum allowed size of 50MB)r   r   r   r   r   r    r!   . zFile type .z not allowed. Allowed types: z, r'   trueTrueyesYeszUpload started)filenamesizebinz0knowledge_base_id is None; skipping MinIO uploadtenant_/kb_/application/octet-streambucket_nameobject_name	file_data	file_sizecontent_typez Failed to upload file to MinIO: z, continuing without storager+   ...T	automaticnone)titlecontentsummaryr1   	file_type	file_pathr3   	is_publicsegmentation_moder5   r6   r7   character_countrC   rD   rE   rF   rG   rH   rI   )textr5   r6   r7   document_iddocument_titlechunkschunk_countzUpload completed)doc_idrX   rs   auto_tag_document_on_upload!Auto-tagging failed for document : )TaxDocumentClassifierdoc_type
doc_numberissuing_authorityz	Document z classified as: z,Document classification failed for document zText chunking failed: u   文本分块失败: zUpload failed)rX   error )@"app.services.knowledge.file_parserr=   ,app.services.knowledge.text_splitter_servicer>   r?   	documentsrA   r   r   r0   seektellrX   splitlowerjoinloggerinforead
parse_filer   rsplitwarning	tenant_iduuiduuid4r   _MINIO_CONTENT_TYPESgetupload_filer   MINIO_BUCKETioBytesIOlenr   	RECURSIVE	CHARACTERTOKENMARKDOWNTAX_ARTICLE
TAX_CLAUSETAX_CHAPTERTAX_ADAPTIVEcreate_chunks_with_metadataidrh   __dict__copy#app.services.knowledge.auto_taggingrw   	Exception*app.services.knowledge.document_classifierrz   classifyr{   valuer|   r}   commitr   r~   strr   )*r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r=   r>   r?   rA   MAX_FILE_SIZErc   ALLOWED_EXTENSIONSfile_extenable_chunking_boolcontent_bytesrk   parser_servicetext_contentra   minio_serviceupload_content_typeupload_successrh   document_data
doc_createdocumentsplitter_servicesplitter_type_enumrs   document_dictrw   erz   
classifierclassifications*                                             C/lsinfo/ai/hellotax_ai/base_platform/app/api/v1/knowledge/upload.pyupload_document_filer   )   s	      KJJJJJdddddddd******7 	 	 	  V
 
 
 	
 %MINN1a	  IINN1=  $T
 
 
 	
 LKK7:dm7K7Kt}""3''+11333QSH)))ggg		RdHeHegg
 
 
 	
 +.QQ
KK 4=yKIIIwC"iikk))))))474=4H4HDM'',,R00b	0022%00	JJ 	6*9555?Bdm?S?S4=''Q//399;;;Y^$NNMNNNKK d,0cc6Gcc$*,,ccYacc  .//M"6":":8E_"`"`*66$1'*]33m,,0 7  N " #`{```   #36$-3G3G$$S!,,Q//T]#585F5F5L5L|DSD)E11R^&"$0D!P&(<F**$.BL]].BL]]"<00
 
 $44m44
"?7JLII <	SL <	S;S#<#<#>#> %1%;" &&((K77)5)?&&"((**g55)5);&&"((**j88)5)>&&"((**m;;)5)A&&"((**l::)5)@&&"((**m;;)5)A&&"((**n<<)5)B&)EE%)"/"4 (#+> F   !) 1 6 6 8 8*0h'/26{{m,.x{T]cfgmcncnooo[______//HK@@@@  [ [ [NN#Yx{#Y#YVW#Y#YZZZZZZZZ[``````!6!6!8!8J%/%8%8%F%FN(6(?(EH%*8*CH'1?1QH.IIKKK0>0G0MM*-2@2KM,/9G9YM"56KK`HK``AXA^``    !   NNYx{YYVWYY        %$$    S S S9a99:::'(Gs1vv(G(GHHdRS	SWWWWWW''HK8888 	S 	S 	SNNQx{QQaQQRRRRRRRR	S    C C C_t}CFFKKK!$-Q88dBCs   8H#[ E:X S/ .X /
T(9%T#X #T((X ,B4W!  X !
X+%XX XX Y53=Y00Y55[ 9Z [ 
[
%[ [ [

[ \:#A\55\:z/documents/batch-uploadfilesc           
      \  K   ddl m} t          |           }|dvrt          dd          t	          |          dk    rt          dd          |d	v }t
                              t	          |          |
                              d           |rddlm	} |	
                    |                              |j        |k                                              }|st          dd          |	
                    t                                        t          j        |j        k                                              }|st          dd          g } |            }t!                      }|D ]}	 |                                 d {V }d|j        v r |j                            d          d         nd}|                    ||          }|s4|                    |j        d|                    d|          d           d|j        v r!|j                            dd          d         n|j        }d }|r|r||j        nd }n|
j        |
j        }|St
                              |j                                      d           |                    |j        ddd           ;d|j        v r3|j                            dd          d                                         nd}|&t
                              d|j         d           d }nd|
j         d| dt7          j                     d| }t:                              |d           }|                    t@          j!        |tE          j#        |          t	          |          |!          }|st
                              d"|            d }tI          dFi d#|d$|d%t	          |          d&k    r|d d&         d'z   n|d(|d)|d*|d+|d,d-d.|rd/nd0d1|r|nd d2|r|nd d3|r|nd d4t	          |          d5|
j        d6|d7dd8d9d:d}|	%                    |           |	&                                 tO          |j        d||d;|
j        <          } |	%                    |            |	(                                 |	)                    |           	 dd=l*m+}!  |!|	|j                   n<# tX          $ r/}"t
                              d>|j         d?|"            Y d }"~"nd }"~"ww xY w|rdd@l-m.}# d }$|r}|	
                    |                              |j        |k                                              }|r7|j        r/|	
                    t                                        t          j        |j        k                                              }|r|j/        r|j/        0                                r`|	
                    tb                                        tb          j        te          |j/                  k                                              }%nR|	
                    tb                                        tb          j/        |j/        k                                              }%|%r|%j        }$ |#            }&|&3                    |j        |$|||A           |                    |j        d-|j        d|rd9nd0|j4        dB           # tX          $ rl}"t
                              |j                  5                    dC|"            |                    |j        dtm          |"          d           Y d }"~"d }"~"ww xY wto          dD |D                       }'t	          |          |'z
  }(t	          |          |'|(|dES )GNr   r<   rB   rJ   rK   rL   2   z)Maximum 50 files allowed per batch uploadrS   )
file_countr4   zBatch upload started)KnowledgeCategoryi  zCategory not foundzKnowledge base not foundrP   rQ   rR   Fz knowledge.cannot_parse_file_type)rk   )rX   successr~   r@   )rX   zCannot determine tenant_idrZ   zknowledge_base_id is None for z; skipping MinIO uploadr[   r\   r]   r^   r_   z&Failed to upload batch file to MinIO: rh   ri   rj   r+   re   r1   rk   rl   r3   rm   Trn   rf   rg   r5   r6   r7   ro   	author_idr   is_vectorizedvectorization_statuspendingvectorization_progressu   初始版本)rq   version_numberrh   ri   change_summary	editor_idrv   rx   ry   )get_async_vectorization_service)rq   model_idr5   r6   r7   )rX   r   rq   r   r   rh   zBatch upload file failed: c              3   *   K   | ]}|d          
dV  dS )r   r@   Nr   ).0rs     r   	<genexpr>z)batch_upload_documents.<locals>.<genexpr>  s+      ;;aa	l;;;;;;;    )totalr   failedresultsr   )8r   r=   r   r   r   r   bindr   app.models.knowledge_baser   queryfilterr   firstr   r2   r   r   rX   r   r   appendtr   r   r   r   r   r   r   r   r   r   r   r   r   r   addflushr   r   refreshr   rw   r   ,app.services.rag.async_vectorization_servicer   codeisdigitr   intvectorize_document_asyncrh   r~   r   sum))r/   r   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r=   r   r   r   categorykbr   r   batch_minio_servicer0   r   rk   r   rh   r   batch_file_extbatch_object_namebatch_upload_content_typebatch_upload_successdb_documentversionrw   r   r   vector_model_idvector_modelasync_servicesuccess_count
fail_counts)                                            r   batch_upload_documentsr      sS	      KJJJJJwA 	 	 	  V
 
 
 	
 5zzB4_````*.QQ
KK3u::7KKLLQQRhiii T??????88-..556G6Jk6YZZ``bb 	NC8LMMMMXXm$$++M,<@Z,Z[[aacc 	TC8RSSSSG,,..N+-- K[ K[J	["&))++------M8;t}8L8L++C0044RTI)44]INNL $(M#(!"%GS\!]!]    7:dm7K7KDM((a0033QUQ^EI 3x 3,.NBLL		'3(2	 T]33;;<XYYY$(M#(!=    <?4=<P<P$$S!,,R066888V[  !([T][[[   %)!!$}l.D$}$}J[$}$}^b^h^j^j$}$}m{$}$}!,@,D,D"$>- -) (;'F'F ( 5 1 j77!-00!: (G ( ($ , -NN#_L]#_#_```(,%+   e$ 7:,6G6G#6M6MTcT*U22S_ (K	
 $) ,+ v $ 2F"Q++6 *>G::4 0DMmm 0DMmm !$L 1 1 1 '// $)  $e!" &/Y#$ ()q%K( FF;HHJJJ%'N $-&/  G FF7OOOIIKKKJJ{###Z[[[[[[++B???? Z Z ZX;>XXUVXXYYYYYYYYZ# "      #' B!233 1 4 CDD 
   BH$> BHH]33#VM$48R$RSS"UWW 
  B"' B!w00 e$&HHUOO$:$:58s27||;S$T$T$Z$Z$\$\ !- 02xx/E/EejTVT[F[/\/\/b/b/d/d+ B2>/ ? ? A A66 +,)"/"/ 7    NN $##.>%*9M,YIISY(. 	 	 	 	  	[ 	[ 	[KKK//556VST6V6VWWWNN%RUVWRXRXYYZZZZZZZZ	[ ;;7;;;;;MW-JW 	  sS   4B[6B$[6+H[6=S[6
T%T	[6	TG&[66
], A!]''],)0r   r   fastapir   r   r   r   r   r   r	   sqlalchemy.ormr
   app.api.depsr   app.api.permissionsr   
app.configr   app.core.exceptionsr   r   r   app.core.i18nr   app.db.sessionr   
app.modelsr   r   r   r   r   app.schemas.knowledge_documentr   app.services.storage.minior   common_loggingr   __name__r   r   routerpostr   r   r   listr   r   r   r   <module>r     s   				  V V V V V V V V V V V V V V V V V V " " " " " " ) ) ) ) ) ) 2 2 2 2 2 2       ` ` ` ` ` ` ` ` ` ` ( ( ( ( ( ( ! ! ! ! ! !              : 9 9 9 9 9 8 8 8 8 8 8 % % % % % %	H		U*W
O   
  !! tCyy"d4jj$(DJJ$w--499d4CD111caD111k**'&// !122g(():HEEFFjC jCjC
jC tjC Tz	jC
 jC jC jC jC jC 	jC jC jC jC jC "!jCZ &'' #d3ii"d4jj$(DJJ$w--499d4CD111caD111k**'&// !122g(():HEEFFB BB
B tB Tz	B
 B B B B B 	B B B B B ('B B Br   