o
    "iy-                     @   sj   d Z ddlZddlmZ ddlZddlZddlZddlmZ e	e
ZG dd dZdadefdd	ZdS )
u_   
文件解析服务
使用 LangChain 的文档加载器支持多种文件格式的文本提取
    N)Optional)FileProcessingErrorc                
   @   s   e Zd ZdZeddededefddZeddededefd	d
ZeddededefddZ	eddededefddZ
ed dededefddZed!dededefddZed"dedededefddZdS )#FileParserServiceu8   文件解析服务类 - 使用 LangChain 文档加载器file.txtcontentfilenamereturnc              
   C      z!dD ]}z	|  |W   W S  ty   Y qw td t|d ty)     tyC } ztd|  t|t|d}~ww )uc   解析TXT文件

        Raises:
            FileProcessingError: 如果无法解码文件
        )utf-8gbkgb2312latin-1z5Unable to decode TXT file with any supported encodingu-   无法使用任何支持的编码解码文件zFailed to parse TXT file: NdecodeUnicodeDecodeErrorloggererrorr   	Exceptionstrr   r   encodinge r   J/lsinfo/ai/hellotax_ai/base_platform/app/services/knowledge/file_parser.py	parse_txt       

zFileParserService.parse_txtfile.pdfc           
   
   C   s  zhddl m} tjddd}||  |j}W d   n1 s!w   Y  z5||}| }dd |D }d	|}t	d
t
| dt
| d |W tj|rZt| W S W S tj|rht| w w  typ     ty }	 ztd|	  t|t|	d}	~	ww )u   解析PDF文件 - 使用 LangChain PyPDFLoader

        Raises:
            FileProcessingError: 如果无法解析PDF文件
        r   )PyPDFLoaderFz.pdfdeletesuffixNc                 S      g | ]
}|j  r|j qS r   page_contentstrip.0docr   r   r   
<listcomp>=       z/FileParserService.parse_pdf.<locals>.<listcomp>

zSuccessfully parsed PDF, z pages, extracted  characterszFailed to parse PDF file: )$langchain_community.document_loadersr   tempfileNamedTemporaryFilewritenameloadjoinr   infolenospathexistsunlinkr   r   r   r   )
r   r   r   tmp_filetmp_file_pathloader	documents
text_partsresultr   r   r   r   	parse_pdf(   s2   

 zFileParserService.parse_pdf	file.docxc              
   C   s<  z{ddl }ddlm} ||| }dd |jD }g }|jD ]}|jD ]}ddd |jD }	|	r8|	|	 q$q|| }
d	|
}|
 sTtd
|  t|d|dsad|dd v rntd|  t|dtdt| d |W S  ty     ty } ztd|  t|t|d}~ww )u   解析Word文档 (.docx) - 使用 python-docx 库

        Raises:
            FileProcessingError: 如果无法解析DOCX文件
        r   N)BytesIOc                 S   r!   r   textr$   )r&   parar   r   r   r(   \   r)   z0FileParserService.parse_docx.<locals>.<listcomp>z | c                 s   s$    | ]}|j  r|j  V  qd S )NrB   )r&   cellr   r   r   	<genexpr>b   s   " z/FileParserService.parse_docx.<locals>.<genexpr>r*   zExtracted empty content from u   文档内容为空u   ÐÏà¡±á d   zExtracted binary content from u<   提取的内容包含二进制数据，文件可能已损坏z.Successfully parsed .docx document, extracted r+   z Failed to parse .docx document: )docxiorA   Document
paragraphstablesrowsr2   cellsappendr$   r   warningr   
startswithr   r3   r4   r   r   )r   r   rI   rA   r'   rL   tables_texttablerowrow_textall_textr>   r   r   r   r   
parse_docxM   s<   





zFileParserService.parse_docxfile.docc              
   C   s  ddl }tjddd}||  |j}W d   n1 sw   Y  z'z@|jd|gdddd	}|jdkr[|j r[|j }t	
d
t| d |W W tj|rZt| S S t	d|j  W n ty } zt	d|  W Y d}~nd}~ww z>ddlm} t	
d ||}	|	 }
dd |
D }d|}|rt	
dt| d |W W tj|rt| S S W n ty } zt	d|  W Y d}~nd}~ww zMt	
d dD ]C}z5| |}|rt| dkrt	
d| dt| d |W   W W tj|rt| S S W q ttfy$   Y qw W n tyA } zt	d|  W Y d}~nd}~ww t	d t|dtj|rYt| w w )u   解析旧版Word文档 (.doc) - 优先使用 antiword

        Raises:
            FileProcessingError: 如果无法解析DOC文件
        r   NFz.docr   antiwordT   )capture_outputrC   timeoutz2antiword successfully parsed .doc file, extracted r+   zantiword parsing failed: zantiword execution failed: )Docx2txtLoaderz#Attempting to use Docx2txtLoader...c                 S   r!   r   r"   r%   r   r   r   r(      r)   z/FileParserService.parse_doc.<locals>.<listcomp>r*   z3Docx2txtLoader successfully parsed file, extracted zDocx2txtLoader also failed: z#Attempting to read as plain text...)r
   r   r   r   cp1252
   zSuccessfully read file with z encoding, extracted z Plain text reading also failed: zAll .doc parsing methods failed. Suggestions: 1) Verify file is standard Word format; 2) Try saving file as .docx format; 3) Open and re-save with another text editoruJ   所有解析方法均失败，请验证文件格式或转换为.docx格式)
subprocessr-   r.   r/   r0   run
returncodestdoutr$   r   r3   r4   r5   r6   r7   r8   rQ   stderrr   r,   r^   r1   r2   r   r   AttributeErrorr   r   )r   r   ra   r9   r:   r>   rC   r   r^   r;   r<   r=   
docx_errorr   
text_errorr   r   r   	parse_doc}   s   

)





zFileParserService.parse_doc	file.pptxc           
   
   C   s  zcddl m} tjddd}||  |j}W d   n1 s!w   Y  z0||}| }dd |D }d	|}t	d
t
| d |W tj|rUt| W S W S tj|rct| w w  tyk     ty }	 ztd|	  t|t|	d}	~	ww )u   解析PowerPoint文档 - 使用 LangChain UnstructuredPowerPointLoader

        Raises:
            FileProcessingError: 如果无法解析PPTX文件
        r   )UnstructuredPowerPointLoaderFz.pptxr   Nc                 S   r!   r   r"   r%   r   r   r   r(      r)   z0FileParserService.parse_pptx.<locals>.<listcomp>r*   z#Successfully parsed PPT, extracted r+   zFailed to parse PPT document: )r,   rk   r-   r.   r/   r0   r1   r2   r   r3   r4   r5   r6   r7   r8   r   r   r   r   )
r   r   rk   r9   r:   r;   r<   r=   r>   r   r   r   r   
parse_pptx   s2   

zFileParserService.parse_pptxfile.mdc              
   C   r	   )up   解析Markdown文件

        Raises:
            FileProcessingError: 如果无法解析Markdown文件
        )r
   r   r   zUnable to decode Markdown fileu5   无法使用任何支持的编码解码Markdown文件zFailed to parse Markdown file: Nr   r   r   r   r   parse_md   r   zFileParserService.parse_mdfile	file_typec              
   C   s|   |  d}| j| j| j| j| j| j| j| j| jd	}||}|r.|||}|	ddS t
d|  t|d| )u  
        根据文件类型解析文件内容

        Args:
            content: 文件二进制内容
            file_type: 文件类型（扩展名，如 'pdf', 'docx'）
            filename: 文件名（用于错误消息）

        Returns:
            提取的文本内容

        Raises:
            FileProcessingError: 如果文件类型不支持或解析失败
        .)	txtrC   pdfr'   rI   pptpptxmdmarkdownrG    zUnsupported file type: u   不支持的文件类型: )lowerr$   r   r?   ri   rX   rl   rn   getreplacer   rQ   r   )clsr   rp   r   parsersparserrC   r   r   r   
parse_file   s"   

zFileParserService.parse_fileN)r   )r   )r@   )rY   )rj   )rm   )ro   )__name__
__module____qualname____doc__staticmethodbytesr   r   r?   rX   ri   rl   rn   classmethodr   r   r   r   r   r      s     $/F$"r   r   c                   C   s   t du rt a t S )u0   获取文件解析服务实例（单例模式）N)_file_parser_servicer   r   r   r   r   get_file_parser_service+  s   r   )r   loggingtypingr   rJ   r-   r5   app.core.exceptionsr   	getLoggerr   r   r   r   r   r   r   r   r   <module>   s    
  