
    XjW$                     n    d dl Z d dlZd dlmZ d dlmZ  ee          Z G d d          Zda	defdZ
dS )    N)FileProcessingError)
get_loggerc            
          e Zd Zeddededefd            Zeddededefd            Zeddededefd	            Zeddededefd            Z	eddededefd            Z
eddededefd            Zeddedededefd            ZdS )FileParserServicefile.txtcontentfilenamereturnc                 ^   	 dD ])}	 |                      |          c S # t          $ r Y &w xY wt                              d           t	          |d          # t          $ r  t
          $ r@}t                              d|            t	          |t          |                    d d }~ww xY w)N)utf-8gbkgb2312latin-1z5Unable to decode TXT file with any supported encodingu-   无法使用任何支持的编码解码文件zFailed to parse TXT file: decodeUnicodeDecodeErrorloggererrorr   	Exceptionstrr   r	   encodinges       J/lsinfo/ai/hellotax_ai/base_platform/app/services/knowledge/file_parser.py	parse_txtzFileParserService.parse_txt   s    	BA  ">>(33333)   HLLPQQQ%h0_```" 	 	 	 	B 	B 	BLL9a99:::%hA77TA	B2   A A 
+A +-A B,,;B''B,file.pdfc                 (   	 ddl m} t          j        dd          5 }|                    |            |j        }d d d            n# 1 swxY w Y   	  ||          }|                                }d |D             }d                    |          }t          	                    dt          |           d	t          |           d
           |t          j                            |          rt          j        |           S S # t          j                            |          rt          j        |           w w xY w# t          $ r  t           $ r@}	t                              d|	            t          |t%          |	                    d d }	~	ww xY w)Nr   )PyPDFLoaderFz.pdfdeletesuffixc                 N    g | ]"}|j                                         |j         #S  page_contentstrip.0docs     r   
<listcomp>z/FileParserService.parse_pdf.<locals>.<listcomp>*   /    ```3sGWG]G]G_G_`c.```    

zSuccessfully parsed PDF, z pages, extracted  characterszFailed to parse PDF file: )$langchain_community.document_loadersr   tempfileNamedTemporaryFilewritenameloadjoinr   infolenospathexistsunlinkr   r   r   r   )
r   r	   r   tmp_filetmp_file_pathloader	documents
text_partsresultr   s
             r   	parse_pdfzFileParserService.parse_pdf   s   	BHHHHHH,E&III .Xw''' (. . . . . . . . . . . . . . .-$]33"KKMM	``)```
Z00jIjjRUV\R]R]jjj   7>>-00 -Im,,,,-27>>-00 -Im,,,,-" 	 	 	 	B 	B 	BLL9a99:::%hA77TA	BsR   D= AD= AD= AD= A<D 3D= 6D::D= =F;FF	file.docxc                 r   	 ddl m} dd l}|                     ||                     }d |j        D             }g }|j        D ]G}|j        D ]=}d                    d |j        D                       }	|	r|	                    |	           >H||z   }
d                    |
          }|
                                s-t                              d|            t          |d          |                    d	          sd
|d d         v r-t                              d|            t          |d          t                              dt#          |           d           |S # t          $ r  t$          $ r@}t                              d|            t          |t'          |                    d d }~ww xY w)Nr   )BytesIOc                 N    g | ]"}|j                                         |j         #S r$   textr'   )r)   paras     r   r+   z0FileParserService.parse_docx.<locals>.<listcomp>A   s+    TTT$)//BSBST$)TTTr-   z | c              3   z   K   | ]6}|j                                         |j                                         V  7d S NrH   )r)   cells     r   	<genexpr>z/FileParserService.parse_docx.<locals>.<genexpr>E   sS       * *.249??CTCT*	))* * * * * *r-   r.   zExtracted empty content from u   文档内容为空u   ÐÏà¡±á d   zExtracted binary content from u<   提取的内容包含二进制数据，文件可能已损坏z.Successfully parsed .docx document, extracted r/   z Failed to parse .docx document: )iorF   docxDocument
paragraphstablesrowsr6   cellsappendr'   r   warningr   
startswithr   r7   r8   r   r   )r   r	   rF   rR   r*   rT   tables_texttablerowrow_textall_textrB   r   s                r   
parse_docxzFileParserService.parse_docx9   s   	B""""""KKK-- 0 011CTTTTTJK 5 5 : 5 5C$zz * *69i* * *    H   5#**84445 "K/H[[**F<<>> JIxIIJJJ)(4HIII  00 tFfTcTl4J4JHhHHIII)(4rsssKKaVaaabbbM" 	 	 	 	B 	B 	BLL?A??@@@%hA77TA	Bs   EE" "F66;F11F6file.docc           	         dd l }t          j        dd          5 }|                    |            |j        }d d d            n# 1 swxY w Y   	 	 |                    d|gddd          }|j        dk    r|j                                        rz|j                                        }t          
                    d	t          |           d
           |t          j                            |          rt          j        |           S S t                              d|j                    n4# t$          $ r'}t                              d|            Y d }~nd }~ww xY w	 ddlm} t          
                    d            ||          }	|	                                }
d |
D             }d                    |          }|rat          
                    dt          |           d
           |t          j                            |          rt          j        |           S S n4# t$          $ r'}t                              d|            Y d }~nd }~ww xY w	 t          
                    d           dD ]}	 |                     |          }|rt          |                                          dk    rft          
                    d| dt          |           d
           |c t          j                            |          rt          j        |           S S # t0          t2          f$ r Y w xY wn4# t$          $ r'}t                              d|            Y d }~nd }~ww xY wt                              d           t7          |d          # t          j                            |          rt          j        |           w w xY w)Nr   Fz.docr    antiwordT   )capture_outputrI   timeoutz2antiword successfully parsed .doc file, extracted r/   zantiword parsing failed: zantiword execution failed: )Docx2txtLoaderz#Attempting to use Docx2txtLoader...c                 N    g | ]"}|j                                         |j         #S r$   r%   r(   s     r   r+   z/FileParserService.parse_doc.<locals>.<listcomp>v   r,   r-   r.   z3Docx2txtLoader successfully parsed file, extracted zDocx2txtLoader also failed: z#Attempting to read as plain text...)r   r   r   r   cp1252
   zSuccessfully read file with z encoding, extracted z Plain text reading also failed: zAll .doc parsing methods failed. Suggestions: 1) Verify file is standard Word format; 2) Try saving file as .docx format; 3) Open and re-save with another text editoruJ   所有解析方法均失败，请验证文件格式或转换为.docx格式)
subprocessr1   r2   r3   r4   run
returncodestdoutr'   r   r7   r8   r9   r:   r;   r<   rY   stderrr   r0   rg   r5   r6   r   r   AttributeErrorr   r   )r   r	   rk   r=   r>   rB   rI   r   rg   r?   r@   rA   
docx_errorr   
text_errors                  r   	parse_doczFileParserService.parse_docZ   s   (fEEE 	*NN7###$MM	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	*4	)B#/4Y[ (   $))fm.A.A.C.C)!=..00DKKcSQUYYccc    R w~~m,, )	-(((()O NN#Nv}#N#NOOOO B B B@Q@@AAAAAAAABLOOOOOOABBB'66"KKMM	``)```
Z00 "KKfcRXkkfff   "0 w~~m,, )	-(((()9"
  L L LJjJJKKKKKKKKLPABBB O 	! 	!H!&~~h77 (C

$5$5$:$:"KK tx t t^abf^g^g t t t   $(KK w~~m,, )	-(((() /? ! ! ! !	!  P P PN*NNOOOOOOOOPLL y   &f   w~~m,, )	-(((()s   AAABD- 
"D- ,N -
E7EN EN "BH& %N &
I0IN IN L5 :A+L%L5 L5 L1.L5 0L11L5 4N 5
M&?M!N !M&&-N 6O		file.pptxc                    	 ddl m} t          j        dd          5 }|                    |            |j        }d d d            n# 1 swxY w Y   	  ||          }|                                }d |D             }d                    |          }t          	                    dt          |           d	           |t          j                            |          rt          j        |           S S # t          j                            |          rt          j        |           w w xY w# t          $ r  t           $ r@}	t                              d
|	            t          |t%          |	                    d d }	~	ww xY w)Nr   )UnstructuredPowerPointLoaderFz.pptxr    c                 N    g | ]"}|j                                         |j         #S r$   r%   r(   s     r   r+   z0FileParserService.parse_pptx.<locals>.<listcomp>   r,   r-   r.   z#Successfully parsed PPT, extracted r/   zFailed to parse PPT document: )r0   rv   r1   r2   r3   r4   r5   r6   r   r7   r8   r9   r:   r;   r<   r   r   r   r   )
r   r	   rv   r=   r>   r?   r@   rA   rB   r   s
             r   
parse_pptxzFileParserService.parse_pptx   s   	BYYYYYY ,E'JJJ .hw''' (. . . . . . . . . . . . . . .	-55mDD"KKMM	``)```
Z00Z#f++ZZZ[[[7>>-00 -Im,,,,-27>>-00 -Im,,,,-" 	 	 	 	B 	B 	BLL=!==>>>%hA77TA	BsR   D- AD- AD- AD- A,C4 ?3D- 46D**D- -F;E<<Ffile.mdc                 ^   	 dD ])}	 |                      |          c S # t          $ r Y &w xY wt                              d           t	          |d          # t          $ r  t
          $ r@}t                              d|            t	          |t          |                    d d }~ww xY w)N)r   r   r   zUnable to decode Markdown fileu5   无法使用任何支持的编码解码Markdown文件zFailed to parse Markdown file: r   r   s       r   parse_mdzFileParserService.parse_md   s    	B6  ">>(33333)   HLL9:::%h0ghhh" 	 	 	 	B 	B 	BLL>1>>???%hA77TA	Br   file	file_typec           
         |                                                     d          }| j        | j        | j        | j        | j        | j        | j        | j        | j        d	}|                    |          }|r" |||          }|	                    dd          S t                              d|            t          |d|           )N.)	txtrI   pdfr*   rR   pptpptxmdmarkdownrO    zUnsupported file type: u   不支持的文件类型: )lowerr'   r   rC   rs   r`   rx   r{   getreplacer   rY   r   )clsr   r}   r	   parsersparserrI   s          r   
parse_filezFileParserService.parse_file   s    OO%%++C00	=M==N>N,

 

 Y'' 	Z6'8,,D<<+++NN@Y@@AAA%h0XY0X0XYYYr-   N)r   )r   )rD   )ra   )rt   )ry   )r|   )__name__
__module____qualname__staticmethodbytesr   r   rC   r`   rs   rx   r{   classmethodr   r$   r-   r   r   r      s       B B5 BC B B B B \B B B5 BC B B B B \B2 B BE BS B3 B B B \B@ :) :)5 :)C :) :) :) :) \:)x B BE BS B3 B B B \B0 B B% B3 Bs B B B \B Z Z Z3 Z# ZSV Z Z Z [Z Z Zr-   r   r
   c                  :    t           t                      a t           S rL   )_file_parser_servicer   r$   r-   r   get_file_parser_servicer      s    #022r-   )r9   r1   app.core.exceptionsr   common_loggingr   r   r   r   r   r   r$   r-   r   <module>r      s    				  3 3 3 3 3 3 % % % % % %	H		
GZ GZ GZ GZ GZ GZ GZ GZT   !2            r-   