import os
import tempfile

from app.core.exceptions import FileProcessingError

from common_logging import get_logger

logger = get_logger(__name__)




class FileParserService:
    """Extract plain text from in-memory documents of several formats.

    Every ``parse_*`` method receives the raw file bytes plus a ``filename`` that
    is used only for error reporting, and returns the extracted text. Failures
    are reported as ``FileProcessingError(filename, reason)``. Third-party
    parsers are imported lazily inside the method that needs them.
    """

    # Header signature of an OLE2 compound file (legacy .doc / .ppt containers).
    _OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
    # Local-file-header signature of a ZIP archive (.docx / .pptx are ZIP files).
    _ZIP_MAGIC = b"PK\x03\x04"
    # Byte-order marks and the codec that both honours and strips each of them.
    _BOM_CODECS = (
        (b"\xef\xbb\xbf", "utf-8-sig"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )

    @staticmethod
    def _decode_text(content: bytes, encodings):
        """Decode ``content`` to ``str``, or return ``None`` if nothing fits.

        A leading byte-order mark takes priority: it identifies the encoding
        unambiguously and is removed from the result. Otherwise ``encodings``
        are tried in order and the first one that decodes cleanly wins.
        """
        for bom, codec in FileParserService._BOM_CODECS:
            if content.startswith(bom):
                try:
                    return content.decode(codec)
                except UnicodeDecodeError:
                    # The BOM was misleading; fall back to the candidate list.
                    break
        for encoding in encodings:
            try:
                return content.decode(encoding)
            except UnicodeDecodeError:
                continue
        return None

    @staticmethod
    def _write_temp_file(content: bytes, suffix: str) -> str:
        """Write ``content`` to a new temporary file and return its path.

        The caller owns the file and must remove it. If writing fails, the
        partially written file is removed here before the error propagates.
        """
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            with tmp_file:
                tmp_file.write(content)
        except BaseException:
            FileParserService._remove_temp_file(tmp_file.name)
            raise
        return tmp_file.name

    @staticmethod
    def _remove_temp_file(path: str) -> None:
        """Delete ``path``; a file that is already gone is not an error."""
        try:
            os.unlink(path)
        except FileNotFoundError:
            pass

    @staticmethod
    def parse_txt(content: bytes, filename: str = "file.txt") -> str:
        """Decode a plain-text file.

        A UTF-8/UTF-16 byte-order mark is honoured and stripped; otherwise
        utf-8, gbk, gb2312 and latin-1 are tried in that order.

        Raises:
            FileProcessingError: if the content cannot be decoded.
        """
        try:
            text = FileParserService._decode_text(
                content, ["utf-8", "gbk", "gb2312", "latin-1"]
            )
            if text is not None:
                return text
            logger.error("Unable to decode TXT file with any supported encoding")
            raise FileProcessingError(filename, "无法使用任何支持的编码解码文件")
        except FileProcessingError:
            raise
        except Exception as e:
            logger.error(f"Failed to parse TXT file: {e}")
            raise FileProcessingError(filename, str(e)) from None

    @staticmethod
    def parse_pdf(content: bytes, filename: str = "file.pdf") -> str:
        """Extract text from a PDF with ``PyPDFLoader``.

        The loader works on a path, so the bytes are spooled to a temporary
        file that is removed afterwards. Pages whose text is blank are skipped
        and the remaining pages are joined with blank lines.

        Raises:
            FileProcessingError: if loading or parsing fails.
        """
        try:
            from langchain_community.document_loaders import PyPDFLoader

            tmp_file_path = FileParserService._write_temp_file(content, ".pdf")
            try:
                loader = PyPDFLoader(tmp_file_path)
                documents = loader.load()
                text_parts = [doc.page_content for doc in documents if doc.page_content.strip()]
                result = "\n\n".join(text_parts)
                logger.info(
                    f"Successfully parsed PDF, {len(documents)} pages, extracted {len(result)} characters"
                )
                return result
            finally:
                FileParserService._remove_temp_file(tmp_file_path)
        except FileProcessingError:
            raise
        except Exception as e:
            logger.error(f"Failed to parse PDF file: {e}")
            raise FileProcessingError(filename, str(e)) from None

    @staticmethod
    def parse_docx(content: bytes, filename: str = "file.docx") -> str:
        """Extract paragraph and table text from a .docx document.

        Non-blank paragraphs come first, followed by one line per non-empty
        table row with its non-empty cells joined by `` | ``.

        Raises:
            FileProcessingError: if the document is empty, looks like binary
                data, or cannot be parsed.
        """
        try:
            from io import BytesIO

            import docx

            doc = docx.Document(BytesIO(content))
            paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
            tables_text = []
            for table in doc.tables:
                for row in table.rows:
                    row_text = " | ".join(
                        cell.text.strip() for cell in row.cells if cell.text.strip()
                    )
                    if row_text:
                        tables_text.append(row_text)
            all_text = paragraphs + tables_text
            result = "\n\n".join(all_text)
            if not result.strip():
                logger.warning(f"Extracted empty content from {filename}")
                raise FileProcessingError(filename, "文档内容为空")
            # Guard against a legacy binary container whose bytes leaked into the text.
            if result.startswith("ÐÏà¡±á") or "\x00" in result[:100]:
                logger.error(f"Extracted binary content from {filename}")
                raise FileProcessingError(filename, "提取的内容包含二进制数据，文件可能已损坏")
            logger.info(f"Successfully parsed .docx document, extracted {len(result)} characters")
            return result
        except FileProcessingError:
            raise
        except Exception as e:
            logger.error(f"Failed to parse .docx document: {e}")
            raise FileProcessingError(filename, str(e)) from None

    @staticmethod
    def parse_doc(content: bytes, filename: str = "file.doc") -> str:
        """Extract text from a legacy .doc file, trying several strategies.

        Order of attempts:
          1. the external ``antiword`` tool (30 second timeout);
          2. ``Docx2txtLoader``, for .docx files carrying a .doc extension;
          3. decoding the bytes as plain text, for text files named .doc.

        Step 3 is skipped when the bytes start with an OLE or ZIP signature:
        such content is a binary container, and because latin-1 can decode any
        byte string it would otherwise be returned as garbage instead of
        reporting the failure.

        Raises:
            FileProcessingError: if every strategy fails.
        """
        import subprocess

        tmp_file_path = FileParserService._write_temp_file(content, ".doc")
        try:
            try:
                completed = subprocess.run(
                    ["antiword", tmp_file_path], capture_output=True, text=True, timeout=30
                )
                if completed.returncode == 0 and completed.stdout.strip():
                    text = completed.stdout.strip()
                    logger.info(
                        f"antiword successfully parsed .doc file, extracted {len(text)} characters"
                    )
                    return text
                else:
                    logger.warning(f"antiword parsing failed: {completed.stderr}")
            except Exception as e:
                # Missing binary, timeout or undecodable output: try the next strategy.
                logger.warning(f"antiword execution failed: {e}")
            try:
                from langchain_community.document_loaders import Docx2txtLoader

                logger.info("Attempting to use Docx2txtLoader...")
                loader = Docx2txtLoader(tmp_file_path)
                documents = loader.load()
                text_parts = [doc.page_content for doc in documents if doc.page_content.strip()]
                joined = "\n\n".join(text_parts)
                if joined:
                    logger.info(
                        f"Docx2txtLoader successfully parsed file, extracted {len(joined)} characters"
                    )
                    return joined
            except Exception as docx_error:
                logger.warning(f"Docx2txtLoader also failed: {docx_error}")
            try:
                logger.info("Attempting to read as plain text...")
                if content.startswith(
                    (FileParserService._OLE_MAGIC, FileParserService._ZIP_MAGIC)
                ):
                    logger.warning(
                        "Skipping plain-text fallback: content is a binary OLE/ZIP container"
                    )
                else:
                    for encoding in ["utf-8", "gbk", "gb2312", "latin-1", "cp1252"]:
                        try:
                            text = content.decode(encoding)
                        except (UnicodeDecodeError, AttributeError):
                            continue
                        # Very short results are treated as a failed decode.
                        if text and len(text.strip()) > 10:
                            logger.info(
                                f"Successfully read file with {encoding} encoding, extracted {len(text)} characters"
                            )
                            return text
            except Exception as text_error:
                logger.warning(f"Plain text reading also failed: {text_error}")
            logger.error(
                "All .doc parsing methods failed. Suggestions: 1) Verify file is standard Word format; 2) Try saving file as .docx format; 3) Open and re-save with another text editor"
            )
            raise FileProcessingError(
                filename, "所有解析方法均失败，请验证文件格式或转换为.docx格式"
            )
        finally:
            FileParserService._remove_temp_file(tmp_file_path)

    @staticmethod
    def parse_pptx(content: bytes, filename: str = "file.pptx") -> str:
        """Extract text from a PowerPoint file with ``UnstructuredPowerPointLoader``.

        ``parse_file`` routes both ``ppt`` and ``pptx`` here. The temporary file
        gets a ``.ppt`` suffix when the bytes carry the OLE signature of the
        legacy format, because the loader may rely on the file extension to
        choose between its PPT and PPTX code paths.

        Raises:
            FileProcessingError: if loading or parsing fails.
        """
        try:
            from langchain_community.document_loaders import UnstructuredPowerPointLoader

            is_legacy = content.startswith(FileParserService._OLE_MAGIC)
            suffix = ".ppt" if is_legacy else ".pptx"
            tmp_file_path = FileParserService._write_temp_file(content, suffix)
            try:
                loader = UnstructuredPowerPointLoader(tmp_file_path)
                documents = loader.load()
                text_parts = [doc.page_content for doc in documents if doc.page_content.strip()]
                result = "\n\n".join(text_parts)
                logger.info(f"Successfully parsed PPT, extracted {len(result)} characters")
                return result
            finally:
                FileParserService._remove_temp_file(tmp_file_path)
        except FileProcessingError:
            raise
        except Exception as e:
            logger.error(f"Failed to parse PPT document: {e}")
            raise FileProcessingError(filename, str(e)) from None

    @staticmethod
    def parse_md(content: bytes, filename: str = "file.md") -> str:
        """Decode a Markdown file.

        A UTF-8/UTF-16 byte-order mark is honoured and stripped; otherwise
        utf-8, gbk and gb2312 are tried in that order.

        Raises:
            FileProcessingError: if the content cannot be decoded.
        """
        try:
            text = FileParserService._decode_text(content, ["utf-8", "gbk", "gb2312"])
            if text is not None:
                return text
            logger.error("Unable to decode Markdown file")
            raise FileProcessingError(filename, "无法使用任何支持的编码解码Markdown文件")
        except FileProcessingError:
            raise
        except Exception as e:
            logger.error(f"Failed to parse Markdown file: {e}")
            raise FileProcessingError(filename, str(e)) from None

    @classmethod
    def parse_file(cls, content: bytes, file_type: str, filename: str = "file") -> str:
        """Dispatch to the parser registered for ``file_type``.

        ``file_type`` is matched case-insensitively; surrounding whitespace and
        dots are ignored, so ``"PDF"``, ``".pdf"`` and ``" pdf "`` are
        equivalent. NUL characters are removed from the extracted text.

        Raises:
            FileProcessingError: if the type is unsupported or parsing fails.
        """
        file_type = file_type.strip().lower().strip(".")
        parsers = {
            "txt": cls.parse_txt,
            "text": cls.parse_txt,
            "pdf": cls.parse_pdf,
            "doc": cls.parse_doc,
            "docx": cls.parse_docx,
            "ppt": cls.parse_pptx,
            "pptx": cls.parse_pptx,
            "md": cls.parse_md,
            "markdown": cls.parse_md,
        }
        parser = parsers.get(file_type)
        if parser is None:
            logger.warning(f"Unsupported file type: {file_type}")
            raise FileProcessingError(filename, f"不支持的文件类型: {file_type}")
        text = parser(content, filename)
        return text.replace("\x00", "")


# Module-wide FileParserService instance, created on the first call to
# get_file_parser_service() and reused by every later call.
_file_parser_service = None


def get_file_parser_service() -> FileParserService:
    """Return the shared ``FileParserService``, creating it on first use."""
    global _file_parser_service
    service = _file_parser_service
    if service is not None:
        return service
    service = FileParserService()
    _file_parser_service = service
    return service
