import os
import subprocess
import tempfile
from typing import Any

from docx import Document

from common_logging import get_logger

logger = get_logger(__name__)



class TaxDocumentParser:

    def __init__(self):
        pass

    def parse_docx(self, file_path: str) -> dict[str, str] | None:
        try:
            doc = Document(file_path)
            title = ""
            content_paragraphs = []
            for _i, para in enumerate(doc.paragraphs):
                text = para.text.strip()
                if not text:
                    continue
                if not title and text:
                    title = text
                else:
                    content_paragraphs.append(text)
            content = "\n".join(content_paragraphs)
            if not title:
                title = os.path.basename(file_path).replace(".docx", "").replace(".doc", "")
            logger.info(f"Successfully parsed DOCX file: {file_path}")
            return {"title": title, "content": content, "full_text": title + "\n\n" + content}
        except Exception as e:
            logger.error(f"Failed to parse DOCX file {file_path}: {e}")
            return None

    def parse_doc(self, file_path: str) -> dict[str, str] | None:
        try:
            try:
                return self.parse_docx(file_path)
            except Exception:
                pass
            logger.warning(f"Cannot directly parse .doc file, attempting conversion: {file_path}")
            with tempfile.TemporaryDirectory() as temp_dir:
                try:
                    subprocess.run(
                        [
                            "soffice",
                            "--headless",
                            "--convert-to",
                            "docx",
                            "--outdir",
                            temp_dir,
                            file_path,
                        ],
                        check=True,
                        capture_output=True,
                        timeout=30,
                    )
                    base_name = os.path.basename(file_path).replace(".doc", ".docx")
                    converted_path = os.path.join(temp_dir, base_name)
                    if os.path.exists(converted_path):
                        return self.parse_docx(converted_path)
                except Exception as e:
                    logger.error(f"LibreOffice conversion failed: {e}")
            logger.warning(f"Cannot parse .doc file, returning filename as title: {file_path}")
            return {
                "title": os.path.basename(file_path).replace(".doc", ""),
                "content": f"[Unparseable .doc file: {file_path}]",
                "full_text": os.path.basename(file_path).replace(".doc", ""),
            }
        except Exception as e:
            logger.error(f"Failed to parse DOC file {file_path}: {e}")
            return None

    def parse_file(self, file_path: str) -> dict[str, str] | None:
        if not os.path.exists(file_path):
            logger.error(f"File does not exist: {file_path}")
            return None
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == ".docx":
            return self.parse_docx(file_path)
        elif file_ext == ".doc":
            return self.parse_doc(file_path)
        else:
            logger.error(f"Unsupported file format: {file_ext}")
            return None

    def extract_metadata(self, file_path: str) -> dict[str, any]:
        metadata = {
            "file_name": os.path.basename(file_path),
            "file_size": os.path.getsize(file_path),
            "file_type": os.path.splitext(file_path)[1].lower(),
            "file_path": file_path,
        }
        try:
            if file_path.endswith(".docx"):
                doc = Document(file_path)
                core_props = doc.core_properties
                if core_props.title:
                    metadata["doc_title"] = core_props.title
                if core_props.author:
                    metadata["author"] = core_props.author
                if core_props.created:
                    metadata["created"] = core_props.created
                if core_props.modified:
                    metadata["modified"] = core_props.modified
        except Exception as e:
            logger.warning(f"Failed to extract metadata: {e}")
        return metadata


# Shared parser instance; stays None until get_tax_document_parser() is first called.
_parser = None


def get_tax_document_parser() -> TaxDocumentParser:
    """Return the module-wide ``TaxDocumentParser`` instance.

    The instance is created on the first call and stored in the module-level
    ``_parser`` variable; every later call returns that same object.
    """
    global _parser
    if _parser is not None:
        return _parser
    _parser = TaxDocumentParser()
    return _parser
