import asyncio
import hashlib
import os
import re
import shutil
from pathlib import Path

import httpx
from common_logging import get_logger

logger = get_logger(__name__)

# Candidate LibreOffice executables, probed in order with shutil.which() by
# DocConverter._find_libreoffice: bare command names first, then absolute paths.
_LIBREOFFICE_BINS = ['libreoffice', 'soffice', '/usr/bin/libreoffice', '/usr/bin/soffice', '/Applications/LibreOffice.app/Contents/MacOS/soffice']
# Timeout, in seconds, applied to each LibreOffice conversion subprocess.
_CONVERT_TIMEOUT = 120
# Chunk size, in bytes, requested from the HTTP response stream while downloading.
_CHUNK_SIZE = 8192

class DocConverter:

    def __init__(self, save_root: str='/tmp/tax_documents'):
        """Create a converter.

        Args:
            save_root: Base directory for documents; stored as a ``Path`` on
                ``self.save_root``.
        """
        root_dir = Path(save_root)
        self.save_root = root_dir
        # Resolve the LibreOffice executable once; None when none is found.
        self._lo_bin = self._find_libreoffice()

    async def process(self, url: str, category_id: int, raw_dir: str, headers: dict | None=None) -> dict:
        """Download the document at ``url`` and convert it to Markdown.

        The converter is selected from the extension of the URL path
        (.doc/.docx/.wps, .pdf, .xls/.xlsx, .ppt/.pptx).

        Args:
            url: Source URL of the document.
            category_id: Not used by this method; kept for interface compatibility.
            raw_dir: Directory in which the downloaded file is stored (created if missing).
            headers: Optional HTTP headers for the download request.

        Returns:
            A dict with keys ``success``, ``markdown``, ``raw_path``, ``file_hash``,
            ``file_size``, ``parse_quality_score`` and ``error``. ``success`` is
            False only when the download fails; a failed or unsupported conversion
            still yields ``success=True`` with a score of 0.0 and a non-empty ``error``.
        """
        ext = self._get_ext(url)
        filename = self._url_to_filename(url)
        raw_path = Path(raw_dir) / filename
        raw_path.parent.mkdir(parents=True, exist_ok=True)
        download_result = await self._download(url, str(raw_path), headers)
        if not download_result['success']:
            return {'success': False, 'markdown': '', 'raw_path': str(raw_path), 'file_hash': '', 'file_size': 0, 'parse_quality_score': 0.0, 'error': download_result.get('error', '下载失败')}
        file_hash = download_result['file_hash']
        file_size = download_result['file_size']
        try:
            if ext in ('.doc', '.docx', '.wps'):
                markdown, score = await self._doc_to_markdown(str(raw_path), ext)
            elif ext == '.pdf':
                markdown, score = self._pdf_to_markdown(str(raw_path))
            elif ext in ('.xls', '.xlsx'):
                markdown, score = self._xls_to_markdown(str(raw_path))
            elif ext in ('.ppt', '.pptx'):
                markdown, score = await self._ppt_to_markdown(str(raw_path), ext)
            else:
                logger.warning(f'[DocConverter] 不支持的格式 {ext}: {url}')
                markdown, score = ('', 0.0)
        except Exception as e:
            # logger.exception logs at ERROR level with the active traceback.
            # The previous code called .opt() on the formatted str, which raised
            # AttributeError inside this handler and aborted the whole call.
            logger.exception(f'[DocConverter] 转换失败: {raw_path} — {e}')
            markdown, score = ('', 0.0)
        return {'success': True, 'markdown': markdown, 'raw_path': str(raw_path), 'file_hash': file_hash, 'file_size': file_size, 'parse_quality_score': score, 'error': '' if score > 0 else 'Markdown转换失败，请人工复查'}

    async def convert_local(self, local_path: str) -> dict:
        """Convert a file that is already on disk to Markdown.

        The format is taken from ``_detect_real_ext``, so a legacy OLE file
        carrying a modern suffix is routed to the legacy converter.

        Args:
            local_path: Path of the file to convert.

        Returns:
            A dict with ``success``, ``markdown`` and ``parse_quality_score``.
            ``success`` is True only when the resulting score is positive.
        """
        suffix = self._detect_real_ext(local_path)
        word_like = ('.doc', '.docx', '.wps')
        sheet_like = ('.xls', '.xlsx')
        slide_like = ('.ppt', '.pptx')
        try:
            if suffix == '.pdf':
                markdown, score = self._pdf_to_markdown(local_path)
            elif suffix in word_like:
                markdown, score = await self._doc_to_markdown(local_path, suffix)
            elif suffix in sheet_like:
                markdown, score = self._xls_to_markdown(local_path)
            elif suffix in slide_like:
                markdown, score = await self._ppt_to_markdown(local_path, suffix)
            else:
                logger.warning(f'[DocConverter] convert_local 不支持的格式 {suffix}: {local_path}')
                return {'success': False, 'markdown': '', 'parse_quality_score': 0.0}
        except Exception as e:
            logger.error(f'[DocConverter] convert_local 失败: {local_path} — {e}')
            return {'success': False, 'markdown': '', 'parse_quality_score': 0.0}
        succeeded = score > 0
        return {'success': succeeded, 'markdown': markdown, 'parse_quality_score': score}

    async def _download(self, url: str, save_path: str, headers: dict | None) -> dict:
        """Stream ``url`` to ``save_path`` while computing its SHA-256 and size.

        Args:
            url: URL to fetch (redirects are followed; 60 s client timeout).
            save_path: Destination file path; overwritten if it exists.
            headers: Optional request headers.

        Returns:
            ``{'success': True, 'file_hash': <hex sha256>, 'file_size': <bytes>}``
            on success, otherwise ``{'success': False, 'error': <message>}``.
            Exceptions are logged and never propagated.
        """
        request_headers = headers or {}
        # Tracks whether save_path was created/truncated by this call, so a
        # failure before the file is opened never deletes a pre-existing file.
        file_opened = False
        try:
            async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
                async with client.stream('GET', url, headers=request_headers) as resp:
                    resp.raise_for_status()
                    hasher = hashlib.sha256()
                    size = 0
                    with open(save_path, 'wb') as f:
                        file_opened = True
                        async for chunk in resp.aiter_bytes(_CHUNK_SIZE):
                            if not chunk:
                                continue
                            f.write(chunk)
                            hasher.update(chunk)
                            size += len(chunk)
            logger.info(f'[DocConverter] 下载完成: {save_path} ({size} bytes)')
            return {'success': True, 'file_hash': hasher.hexdigest(), 'file_size': size}
        except Exception as e:
            logger.error(f'[DocConverter] 下载失败: {url} — {e}')
            if file_opened:
                # Remove the truncated file so it cannot later be mistaken for
                # a complete download or a pre-converted sibling document.
                try:
                    Path(save_path).unlink(missing_ok=True)
                except OSError as cleanup_err:
                    logger.warning(f'[DocConverter] 清理未完成的下载文件失败: {save_path} — {cleanup_err}')
            return {'success': False, 'error': str(e)}

    async def _doc_to_markdown(self, raw_path: str, ext: str) -> tuple[str, float]:
        """Convert a Word-family document to ``(markdown, quality_score)``.

        For ``.doc``/``.wps`` a sibling ``<stem>.docx`` is reused when present,
        otherwise LibreOffice produces one; if that yields no text, ``textutil``
        is tried with the score capped at 0.8. Any other extension is parsed
        directly as DOCX. The score is text length / 500, capped at 1.0.

        Args:
            raw_path: Path of the source document.
            ext: Lower-case extension including the dot.
        """
        if ext not in ('.doc', '.wps'):
            body = self._extract_docx_text(raw_path)
            if not body.strip():
                # NOTE(review): empty text still scores 0.2 here, so callers
                # treat it as a successful conversion — confirm this is intended.
                return ('', 0.2)
            rendered = self._text_to_markdown(body)
            return (rendered, min(1.0, len(rendered.strip()) / 500))

        source = Path(raw_path)
        # NOTE(review): if raw_path itself ends in .docx (legacy content with a
        # modern suffix), this candidate is the input file itself — verify.
        sibling = source.parent / f'{source.stem}.docx'
        if sibling.exists():
            docx_path = str(sibling)
        else:
            docx_path = await self._convert_to_docx(raw_path)
        if docx_path:
            body = self._extract_docx_text(docx_path)
            if body.strip():
                return (self._text_to_markdown(body), min(1.0, len(body.strip()) / 500))
        # Last resort: textutil, which is only used when found on PATH.
        fallback = await self._textutil_to_text(raw_path)
        if fallback.strip():
            return (self._textutil_text_to_markdown(fallback), min(0.8, len(fallback.strip()) / 500))
        return ('', 0.0)

    async def _textutil_to_text(self, doc_path: str) -> str:
        import shutil
        if not shutil.which('textutil'):
            return ''
        txt_path = str(Path(doc_path).with_suffix('.txt'))
        cmd = ['textutil', '-convert', 'txt', '-output', txt_path, doc_path]
        try:
            proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
            _, stderr = await asyncio.wait_for(proc.communicate(), timeout=30)
            if proc.returncode != 0:
                logger.warning(f"[DocConverter] textutil 转换失败: {stderr.decode(errors='replace')[:200]}")
                return ''
            if Path(txt_path).exists():
                text = Path(txt_path).read_text(encoding='utf-8', errors='replace')
                return text
        except Exception as e:
            logger.warning(f'[DocConverter] textutil 异常: {e}')
        return ''

    def _textutil_text_to_markdown(self, text: str) -> str:
        lines = text.split('\n')
        result = []
        table_rows: list = []

        def flush_table():
            if not table_rows:
                return
            rows_html = []
            for i, row in enumerate(table_rows):
                cells = [c.strip() for c in row.split('\x07')]
                while cells and cells[-1] == '':
                    cells.pop()
                if not any(cells):
                    continue
                tag = 'th' if i == 0 else 'td'
                cells_html = ''.join(f'<{tag}>{c}</{tag}>' for c in cells)
                rows_html.append(f'<tr>{cells_html}</tr>')
            if rows_html:
                result.append("<table border='1'><tbody>" + ''.join(rows_html) + '</tbody></table>')
            table_rows.clear()
        for line in lines:
            if '\x07' in line:
                table_rows.append(line)
            else:
                flush_table()
                result.append(line)
        flush_table()
        joined = '\n'.join(result)
        return self._text_to_markdown(joined)

    async def _convert_to_docx(self, doc_path: str) -> str | None:
        """Convert a legacy Word document to DOCX with headless LibreOffice.

        The output is written to the directory of ``doc_path`` as ``<stem>.docx``.

        Args:
            doc_path: Path of the source document.

        Returns:
            Path of the generated ``.docx``, or ``None`` when LibreOffice is
            unavailable, exits non-zero, times out, produces no output file,
            or any other error occurs.
        """
        if not self._lo_bin:
            logger.error('[DocConverter] 未找到 LibreOffice，无法转换 DOC')
            return None
        out_dir = str(Path(doc_path).parent)
        cmd = [self._lo_bin, '--headless', '--convert-to', 'docx', '--outdir', out_dir, doc_path]
        proc = None
        try:
            proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
            _, stderr = await asyncio.wait_for(proc.communicate(), timeout=_CONVERT_TIMEOUT)
            if proc.returncode != 0:
                logger.error(f"[DocConverter] LibreOffice 转换失败: returncode={proc.returncode} stderr={stderr.decode(errors='replace')[:300]}")
                return None
            stem = Path(doc_path).stem
            docx_path = str(Path(out_dir) / f'{stem}.docx')
            if Path(docx_path).exists():
                return docx_path
            logger.error(f'[DocConverter] 生成的 .docx 不存在: {docx_path}')
            return None
        except asyncio.TimeoutError:
            # asyncio.TimeoutError is the builtin TimeoutError on Python 3.11+
            # and a distinct class before that; this spelling matches on both.
            logger.error(f'[DocConverter] LibreOffice 转换超时: {doc_path}')
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                    # Reap the child so it is not left behind unawaited.
                    await proc.wait()
                except OSError:
                    # Process already exited or cannot be signalled.
                    pass
            return None
        except Exception as e:
            logger.error(f'[DocConverter] LibreOffice 转换异常: {e}')
            return None

    def _extract_docx_text(self, docx_path: str) -> str:
        """Extract text from a DOCX file, keeping tables as HTML.

        ``python-docx`` is tried first: body children are walked in order,
        non-empty paragraphs are kept as text and tables are rendered through
        ``_docx_table_to_html``. If that library is missing or fails,
        ``docx2txt`` is tried.

        Args:
            docx_path: Path of the DOCX file.

        Returns:
            Blocks joined by blank lines, or ``''`` when neither library
            produced a result.
        """
        try:
            import docx
            document = docx.Document(docx_path)
            blocks = []
            for node in document.element.body.iterchildren():
                # Strip the XML namespace: '{ns}p' -> 'p'.
                local_name = node.tag.split('}')[-1] if '}' in node.tag else node.tag
                if local_name == 'tbl':
                    table_html = self._docx_table_to_html(docx.table.Table(node, document))
                    if table_html:
                        blocks.append(table_html)
                elif local_name == 'p':
                    paragraph_text = docx.text.paragraph.Paragraph(node, document).text.strip()
                    if paragraph_text:
                        blocks.append(paragraph_text)
            return '\n\n'.join(blocks)
        except ImportError:
            pass
        except Exception as e:
            logger.warning(f'[DocConverter] python-docx 解析失败: {e}')

        try:
            import docx2txt
            return docx2txt.process(docx_path)
        except ImportError:
            pass
        except Exception as e:
            logger.warning(f'[DocConverter] docx2txt 解析失败: {e}')

        logger.error('[DocConverter] 无可用的 DOCX 解析库（需要 python-docx 或 docx2txt）')
        return ''

    def _ocr_docx_images(self, doc) -> list:
        from app.services.tax_data_processor.ocr.image_ocr import ocr_image_bytes
        ocr_parts = []
        try:
            for i, shape in enumerate(doc.inline_shapes):
                try:
                    rId = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
                    image_part = doc.part.related_parts[rId]
                    img_bytes = image_part.blob
                    ocr_text = ocr_image_bytes(img_bytes)
                    if ocr_text.strip():
                        ocr_parts.append(f'<!-- 图片{i + 1} OCR -->\n{ocr_text}')
                except Exception as e:
                    logger.debug(f'[DocConverter] DOCX 图片{i + 1} OCR 跳过: {e}')
        except Exception as e:
            logger.debug(f'[DocConverter] 枚举 DOCX inline_shapes 失败: {e}')
        return ocr_parts

    def _docx_table_to_html(self, tbl) -> str:
        rows_html = []
        for i, row in enumerate(tbl.rows):
            cells_html = []
            for cell in row.cells:
                text = cell.text.strip().replace('\n', '<br>')
                tag = 'th' if i == 0 else 'td'
                cells_html.append(f'<{tag}>{text}</{tag}>')
            rows_html.append('<tr>' + ''.join(cells_html) + '</tr>')
        return "<table border='1'><tbody>" + ''.join(rows_html) + '</tbody></table>'

    def _pdf_to_markdown(self, pdf_path: str) -> tuple[str, float]:
        """Extract the text layer of a PDF and return ``(markdown, score)``.

        ``pdfminer`` is tried first; when it is missing, fails or yields only
        whitespace, PyMuPDF (``fitz``) is tried. No OCR is attempted here.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            The normalised text and a score of length / 500 capped at 1.0,
            or ``('', 0.0)`` when no text could be extracted.
        """
        extracted = ''
        try:
            from pdfminer.high_level import extract_text
            extracted = extract_text(pdf_path)
        except ImportError:
            pass
        except Exception as e:
            logger.warning(f'[DocConverter] pdfminer 解析失败: {e}')

        if not extracted.strip():
            try:
                import fitz
                with fitz.open(pdf_path) as pdf:
                    page_texts = (page.get_text() for page in pdf)
                    extracted = '\n\n'.join(page_texts)
            except ImportError:
                pass
            except Exception as e:
                logger.warning(f'[DocConverter] pymupdf 解析失败: {e}')

        if extracted.strip():
            markdown = self._text_to_markdown(extracted)
            return (markdown, min(1.0, len(markdown.strip()) / 500))

        logger.error(f'[DocConverter] PDF 文本提取失败（需要 pdfminer.six 或 PyMuPDF）: {pdf_path}')
        return ('', 0.0)

    def _pdf_ocr_text(self, pdf_path: str) -> str:
        """OCR every page of a PDF with PaddleOCR and return the text.

        Each page is rendered through PyMuPDF with a 300/72 scale matrix,
        converted to an RGB array and passed to ``PaddleOCR.predict``. Only
        recognised strings with a score of at least 0.5 are kept.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            Page texts joined by blank lines; ``''`` when a dependency is
            missing, the OCR engine cannot be created, or the PDF cannot be
            opened. A page that fails is logged and skipped.
        """
        try:
            import fitz
            import numpy as np
            from PIL import Image
        except ImportError as e:
            logger.warning(f'[DocConverter] OCR 依赖缺失: {e}')
            return ''
        try:
            from paddleocr import PaddleOCR
            engine = PaddleOCR(use_textline_orientation=True, lang='ch')
        except ImportError:
            logger.warning('[DocConverter] PaddleOCR 未安装，跳过扫描版 OCR')
            return ''
        except Exception as e:
            logger.warning(f'[DocConverter] PaddleOCR 初始化失败: {e}')
            return ''

        recognized_pages = []
        try:
            with fitz.open(pdf_path) as pdf:
                for index, page in enumerate(pdf):
                    try:
                        pixmap = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
                        image = Image.frombytes('RGB', [pixmap.width, pixmap.height], pixmap.samples)
                        predictions = engine.predict(np.array(image))
                        kept = []
                        for item in predictions:
                            pairs = zip(item.get('rec_texts', []), item.get('rec_scores', []), strict=False)
                            # Discard low-confidence and blank recognitions.
                            kept.extend(t.strip() for t, s in pairs if s >= 0.5 and t.strip())
                        if kept:
                            recognized_pages.append('\n'.join(kept))
                    except Exception as e:
                        logger.warning(f'[DocConverter] OCR 第{index + 1}页失败: {e}')
        except Exception as e:
            logger.error(f'[DocConverter] OCR 打开 PDF 失败: {e}')
            return ''
        return '\n\n'.join(recognized_pages)

    def _xls_to_markdown(self, xls_path: str) -> tuple[str, float]:
        """Convert a spreadsheet to per-sheet HTML tables under ``###`` headings.

        ``openpyxl`` is tried first and ``xlrd`` second. Each parser collects
        into its own list, so sheets gathered by a parser that later fails are
        never mixed into the other parser's output.

        Args:
            xls_path: Path of the workbook.

        Returns:
            ``(markdown, score)`` with score = length / 500 capped at 1.0, or
            ``('', 0.0)`` when neither parser produced any non-empty sheet.
        """
        try:
            import openpyxl
            sections = []
            wb = openpyxl.load_workbook(xls_path, read_only=True, data_only=True)
            try:
                for sheet_name in wb.sheetnames:
                    rows = list(wb[sheet_name].iter_rows(values_only=True))
                    if not rows:
                        continue
                    html = self._rows_to_html_table([[str(c) if c is not None else '' for c in row] for row in rows])
                    sections.append(f'### {sheet_name}\n\n{html}')
            finally:
                # Close the workbook even when a sheet fails to parse.
                wb.close()
            if sections:
                markdown = '\n\n'.join(sections)
                return (markdown, min(1.0, len(markdown.strip()) / 500))
        except ImportError:
            pass
        except Exception as e:
            logger.warning(f'[DocConverter] openpyxl 解析失败: {e}')
        try:
            import xlrd
            sections = []
            wb = xlrd.open_workbook(xls_path)
            for sheet in wb.sheets():
                rows = [[str(sheet.cell_value(r, c)) for c in range(sheet.ncols)] for r in range(sheet.nrows)]
                if not rows:
                    continue
                html = self._rows_to_html_table(rows)
                sections.append(f'### {sheet.name}\n\n{html}')
            if sections:
                markdown = '\n\n'.join(sections)
                return (markdown, min(1.0, len(markdown.strip()) / 500))
        except ImportError:
            pass
        except Exception as e:
            logger.warning(f'[DocConverter] xlrd 解析失败: {e}')
        logger.error(f'[DocConverter] XLS 解析失败（需要 openpyxl 或 xlrd）: {xls_path}')
        return ('', 0.0)

    async def _ppt_to_markdown(self, ppt_path: str, ext: str) -> tuple[str, float]:
        """Convert a presentation to ``(markdown, quality_score)``.

        A ``.ppt`` file is first turned into PPTX: a sibling ``<stem>.pptx`` is
        reused when present, otherwise LibreOffice produces one. Any other
        extension is parsed directly as PPTX.

        Args:
            ppt_path: Path of the presentation.
            ext: Lower-case extension including the dot.

        Returns:
            The result of ``_extract_pptx_markdown``, or ``('', 0.0)`` when the
            legacy file could not be converted.
        """
        if ext != '.ppt':
            return self._extract_pptx_markdown(ppt_path)
        source = Path(ppt_path)
        sibling = source.parent / f'{source.stem}.pptx'
        if sibling.exists():
            pptx_path = str(sibling)
        else:
            pptx_path = await self._convert_to_pptx(ppt_path)
        if not pptx_path:
            return ('', 0.0)
        return self._extract_pptx_markdown(pptx_path)

    async def _convert_to_pptx(self, ppt_path: str) -> str | None:
        """Convert a legacy PowerPoint file to PPTX with headless LibreOffice.

        The output is written to the directory of ``ppt_path`` as ``<stem>.pptx``.

        Args:
            ppt_path: Path of the source presentation.

        Returns:
            Path of the generated ``.pptx``, or ``None`` when LibreOffice is
            unavailable, exits non-zero, times out, produces no output file,
            or any other error occurs.
        """
        if not self._lo_bin:
            logger.error('[DocConverter] 未找到 LibreOffice，无法转换 PPT')
            return None
        out_dir = str(Path(ppt_path).parent)
        cmd = [self._lo_bin, '--headless', '--convert-to', 'pptx', '--outdir', out_dir, ppt_path]
        proc = None
        try:
            proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
            _, stderr = await asyncio.wait_for(proc.communicate(), timeout=_CONVERT_TIMEOUT)
            if proc.returncode != 0:
                logger.error(f"[DocConverter] LibreOffice PPT→PPTX 失败: returncode={proc.returncode} stderr={stderr.decode(errors='replace')[:300]}")
                return None
            stem = Path(ppt_path).stem
            pptx_path = str(Path(out_dir) / f'{stem}.pptx')
            if Path(pptx_path).exists():
                return pptx_path
            logger.error(f'[DocConverter] 生成的 .pptx 不存在: {pptx_path}')
            return None
        except asyncio.TimeoutError:
            # asyncio.TimeoutError is the builtin TimeoutError on Python 3.11+
            # and a distinct class before that; this spelling matches on both.
            logger.error(f'[DocConverter] LibreOffice PPT 转换超时: {ppt_path}')
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                    # Reap the child so it is not left behind unawaited.
                    await proc.wait()
                except OSError:
                    # Process already exited or cannot be signalled.
                    pass
            return None
        except Exception as e:
            logger.error(f'[DocConverter] LibreOffice PPT 转换异常: {e}')
            return None

    def _extract_pptx_markdown(self, pptx_path: str) -> tuple[str, float]:
        """Extract slide content from a PPTX file as Markdown.

        For every slide, text frames contribute their non-empty paragraphs,
        tables become HTML tables (repeated identical rows are dropped, keeping
        the first occurrence) and pictures (``shape_type == 13``) are passed to
        the project OCR helper. Slides with content are emitted under a
        ``### 第N页`` heading.

        Args:
            pptx_path: Path of the PPTX file.

        Returns:
            ``(markdown, score)`` with score = length / 500 capped at 1.0, or
            ``('', 0.0)`` when python-pptx is missing, parsing fails, or no
            slide has any content.
        """
        try:
            from pptx import Presentation
            prs = Presentation(pptx_path)
            slide_parts = []
            for slide_num, slide in enumerate(prs.slides, 1):
                parts = []
                for shape in slide.shapes:
                    if shape.has_text_frame:
                        lines = [
                            para.text.strip()
                            for para in shape.text_frame.paragraphs
                            if para.text.strip()
                        ]
                        if lines:
                            parts.append('\n'.join(lines))
                    elif shape.has_table:
                        # Order-preserving de-duplication keyed on the row's cell texts.
                        seen_keys = set()
                        unique_rows = []
                        for row in shape.table.rows:
                            cells = [cell.text.strip() for cell in row.cells]
                            key = tuple(cells)
                            if key not in seen_keys:
                                seen_keys.add(key)
                                unique_rows.append(cells)
                        if unique_rows:
                            parts.append(self._rows_to_html_table(unique_rows))
                    elif shape.shape_type == 13:
                        # OCR is best-effort: any failure only skips this picture.
                        try:
                            from app.services.tax_data_processor.ocr.image_ocr import (
                                ocr_image_bytes,
                            )
                            img_bytes = shape.image.blob
                            ocr_text = ocr_image_bytes(img_bytes)
                            if ocr_text.strip():
                                parts.append(f'<!-- 图片 OCR -->\n{ocr_text}')
                        except Exception as e:
                            logger.debug(f'[DocConverter] PPTX 图片 OCR 跳过: {e}')
                if parts:
                    heading = f'### 第{slide_num}页'
                    slide_parts.append(heading + '\n\n' + '\n\n'.join(parts))
            if not slide_parts:
                return ('', 0.0)
            text = '\n\n'.join(slide_parts)
            markdown = self._text_to_markdown(text)
            score = min(1.0, len(markdown.strip()) / 500)
            return (markdown, score)
        except ImportError:
            logger.error('[DocConverter] 需要 python-pptx（pip install python-pptx）')
            return ('', 0.0)
        except Exception as e:
            # logger.exception logs at ERROR level with the active traceback.
            # The previous code called .opt() on the formatted str, which raised
            # AttributeError here instead of returning the empty result.
            logger.exception(f'[DocConverter] PPTX 解析失败: {pptx_path} — {e}')
            return ('', 0.0)

    @staticmethod
    def _rows_to_html_table(rows: list) -> str:
        rows_html = []
        for i, row in enumerate(rows):
            tag = 'th' if i == 0 else 'td'
            cells_html = ''.join(f'<{tag}>{c}</{tag}>' for c in row)
            rows_html.append(f'<tr>{cells_html}</tr>')
        return "<table border='1'><tbody>" + ''.join(rows_html) + '</tbody></table>'

    def _text_to_markdown(self, text: str) -> str:
        text = re.sub('INCLUDEPICTURE[^\\n]*', '', text)
        text = re.sub('MERGEFORMATINET[^\\n]*', '', text)
        if '<table' in text:
            try:
                from app.services.tax_data_processor.document_cleaner import TaxMarkdownConverter
                md = TaxMarkdownConverter(heading_style='ATX', bullets='-', strip=['script', 'style']).convert(text)
                md = re.sub('\\n{3,}', '\n\n', md)
                return md.strip()
            except Exception:
                pass
        text = re.sub('\\r\\n', '\n', text)
        text = re.sub('\\r', '\n', text)
        text = re.sub('\\n{3,}', '\n\n', text)
        lines = [line.rstrip() for line in text.splitlines()]
        return '\n'.join(lines).strip()

    @staticmethod
    def _get_ext(url: str) -> str:
        from urllib.parse import urlparse
        parsed = urlparse(url)
        _, ext = os.path.splitext(parsed.path)
        return ext.lower()

    @staticmethod
    def _url_to_filename(url: str) -> str:
        from urllib.parse import unquote, urlparse
        parsed = urlparse(url)
        name = unquote(os.path.basename(parsed.path))
        name = re.sub('[<>:"/\\\\|?*\\x00-\\x1f]', '_', name)
        return name or 'download'

    @staticmethod
    def _detect_real_ext(path: str) -> str:
        declared = Path(path).suffix.lower()
        try:
            with open(path, 'rb') as f:
                header = f.read(8)
            if header[:8] == b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
                if declared in ('.docx', '.doc', '.wps'):
                    return '.doc'
                if declared in ('.xlsx', '.xls'):
                    return '.xls'
                if declared in ('.pptx', '.ppt'):
                    return '.ppt'
        except Exception:
            pass
        return declared

    @staticmethod
    def _find_libreoffice() -> str | None:
        """Locate a LibreOffice executable.

        Returns:
            The first candidate from ``_LIBREOFFICE_BINS`` that
            ``shutil.which`` resolves, or ``None`` (after logging a warning)
            when none is found.
        """
        # map() is lazy, so probing stops at the first candidate that resolves.
        located = next((hit for hit in map(shutil.which, _LIBREOFFICE_BINS) if hit), None)
        if located is None:
            logger.warning('[DocConverter] 未找到 LibreOffice，DOC→DOCX 转换不可用。 请安装：apt-get install libreoffice 或 brew install libreoffice')
        return located