import asyncio
import hashlib
import os
from pathlib import Path
from urllib.parse import urlparse

import httpx
from common_logging import get_logger

logger = get_logger(__name__)


class AttachmentParser:

    def __init__(self, timeout: int=30, max_retries: int=3):
        self.timeout = timeout
        self.max_retries = max_retries

    async def download_attachment(self, url: str, save_path: str, headers: dict | None=None, verify_hash: str | None=None, base_dir: Path | None=None) -> dict:
        if headers is None:
            headers = {}
        if 'Referer' not in headers and 'chinatax.gov.cn' in url:
            from urllib.parse import urlparse
            parsed = urlparse(url)
            headers['Referer'] = f'{parsed.scheme}://{parsed.netloc}/'
        save_dir = Path(save_path).parent
        save_dir.mkdir(parents=True, exist_ok=True)
        if Path(save_path).exists():
            if verify_hash:
                file_hash = self._calculate_file_hash(save_path)
                if file_hash == verify_hash:
                    final_path = save_path
                    if base_dir is not None:
                        try:
                            final_path = str(Path(save_path).relative_to(base_dir))
                        except ValueError:
                            pass
                    return {'success': True, 'path': final_path, 'size': Path(save_path).stat().st_size, 'message': '文件已存在且校验通过'}
                else:
                    logger.warning('文件哈希不匹配，重新下载')
        retry_count = 0
        last_error = None
        while retry_count < self.max_retries:
            try:
                async with httpx.AsyncClient(timeout=self.timeout) as client:
                    async with client.stream('GET', url, headers=headers, follow_redirects=True) as response:
                        response.raise_for_status()
                        file_hash_builder = hashlib.sha256()
                        file_size = 0
                        with open(save_path, 'wb') as f:
                            async for chunk in response.aiter_bytes(chunk_size=8192):
                                if not chunk:
                                    continue
                                f.write(chunk)
                                file_hash_builder.update(chunk)
                                file_size += len(chunk)
                        file_hash = file_hash_builder.hexdigest()
                    logger.info(f'附件下载成功: {save_path} ({file_size} bytes)')
                    if verify_hash and file_hash != verify_hash:
                        logger.error('文件哈希校验失败')
                        return {'success': False, 'error': '文件哈希校验失败'}
                    final_path = save_path
                    if base_dir is not None:
                        try:
                            final_path = str(Path(save_path).relative_to(base_dir))
                        except ValueError:
                            pass
                    return {'success': True, 'path': final_path, 'size': file_size, 'hash': file_hash, 'message': '下载成功'}
            except Exception as e:
                retry_count += 1
                last_error = str(e)
                logger.warning(f'下载失败 (尝试 {retry_count}/{self.max_retries}): {e}')
                if retry_count < self.max_retries:
                    import asyncio
                    await asyncio.sleep(2 ** retry_count)
        logger.error(f'附件下载失败: {url}, 错误: {last_error}')
        return {'success': False, 'error': last_error}

    def _calculate_file_hash(self, file_path: str) -> str:
        sha256_hash = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for byte_block in iter(lambda: f.read(4096), b''):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def get_file_extension(self, url: str) -> str:
        import os
        from urllib.parse import urlparse
        parsed = urlparse(url)
        _, ext = os.path.splitext(parsed.path)
        return ext.lower()

    def validate_file_type(self, file_path: str) -> bool:
        file_signatures = {b'%PDF': 'pdf', b'\xd0\xcf\x11\xe0': 'doc/xls', b'PK\x03\x04': 'docx/xlsx'}
        try:
            with open(file_path, 'rb') as f:
                header = f.read(8)
            for signature, file_type in file_signatures.items():
                if header.startswith(signature):
                    logger.debug(f'文件类型验证通过: {file_type}')
                    return True
            logger.warning(f'未识别的文件类型: {file_path}')
            return False
        except Exception as e:
            logger.error(f'文件类型验证失败: {e}')
            return False
