import asyncio
import random
import re

import httpx
from bs4 import BeautifulSoup

from app.services.tax_data_processor.adapters.base import (
    BaseSourceAdapter,
    DocumentItem,
    ParsedDocument,
)

from common_logging import get_logger

logger = get_logger(__name__)

# Root of the 12366 China Tax service-hotline site.
BASE_URL = 'https://12366.chinatax.gov.cn'
# POST endpoint returning one JSON page of message summaries.
LIST_API = f'{BASE_URL}/nszx/onlinemessage/messagelist'
# GET endpoint for a single message's HTML detail page (expects ?id=<code>).
DETAIL_URL = f'{BASE_URL}/nszx/onlinemessage/detail'
# Empty search-filter fields the list API expects alongside 'currentPage'.
_LIST_PARAMS = {'nr': '', 'jg': '', 'zxjg': '', 'lykssj': ''}
# Browser-like User-Agent and Referer sent with every request
# (presumably required to avoid bot blocking — verify against the site).
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Referer': f'{BASE_URL}/nszx/onlinemessage/index'}
_CST_MONTHS = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

def _parse_cst_date(raw: str) -> str | None:
    m = re.search('(\\w{3})\\s+(\\d{1,2})\\s+[\\d:]+\\s+\\w+\\s+(\\d{4})', raw)
    if m:
        month = _CST_MONTHS.get(m.group(1))
        if month:
            return f'{m.group(3)}-{month:02d}-{int(m.group(2)):02d}'
    return None

class QA12366Adapter(BaseSourceAdapter):
    """Source adapter for the 12366 China Tax online message board.

    Lists Q&A messages through the site's paginated JSON API and scrapes
    each message's HTML detail page into a ParsedDocument.
    """

    async def _fetch_list_json(self, page: int) -> dict:
        """POST the message-list API for *page* and return its JSON payload.

        Retries up to 3 attempts with exponential backoff (1s, then 2s)
        on timeouts and HTTP status errors, re-raising after the final
        attempt fails.
        """
        for attempt in range(3):
            try:
                async with httpx.AsyncClient(timeout=30, headers=_HEADERS, follow_redirects=True) as client:
                    r = await client.post(LIST_API, data={'currentPage': page, **_LIST_PARAMS})
                    r.raise_for_status()
                return r.json()
            except (httpx.TimeoutException, httpx.HTTPStatusError):
                if attempt == 2:
                    raise
                await asyncio.sleep(2 ** attempt)

    async def get_total_pages(self) -> int:
        """Return the total number of list pages reported by the API ('maxPage').

        Uses the same retry behavior as list_documents (via _fetch_list_json)
        instead of failing on the first transient error.
        """
        data = await self._fetch_list_json(1)
        return data['maxPage']

    async def list_documents(self, page: int=1) -> list[DocumentItem]:
        """Return one page of message summaries as DocumentItems.

        The question text and answering unit from the list payload are
        stashed in `extra` so parse_document does not need to re-derive them.
        """
        data = await self._fetch_list_json(page)
        return [
            DocumentItem(
                url=f"{DETAIL_URL}?id={item['code']}",
                title=item['title'],
                date=item['fbsj'],
                extra={'question': item['content'], 'unitname': item['unitname']},
            )
            for item in data.get('pageSet', [])
        ]

    async def parse_document(self, item: DocumentItem) -> ParsedDocument | None:
        """Fetch a message's detail page and assemble a ParsedDocument.

        Returns None when the page carries no answer text. Sleeps a random
        per-source delay before returning, to throttle scraping.
        """
        question = item.extra['question']
        unitname = item.extra['unitname']
        async with httpx.AsyncClient(timeout=30, headers=_HEADERS) as client:
            r = await client.get(item.url)
            r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        def _input_val(label: str) -> str:
            # Detail pages lay fields out as a <th> label followed by an
            # <input> whose value attribute holds the field's text.
            for th in soup.find_all('th'):
                if label in th.text:
                    inp = th.find_next('input')
                    return inp.get('value', '').strip() if inp else ''
            return ''

        reply_authority_raw = _input_val('答复机构') or unitname
        reply_date = _parse_cst_date(_input_val('答复时间')) or item.date
        # The answer body lives in a <textarea> following the '答复内容' header.
        answer = ''
        for th in soup.find_all('th'):
            if '答复内容' in th.text:
                ta = th.find_next('textarea')
                if ta:
                    answer = ta.get_text(strip=True)
                break
        if not answer:
            return None
        # Some answers open with "<authority>答复：…" / "…回复：…"; prefer that
        # authority, otherwise fall back to "<unit>12366中心".
        first_line = answer.split('\n', 1)[0].strip()
        m = re.match(r'^(.+?)(答复|回复)[：:]', first_line)
        reply_authority = m.group(1).strip() if m else f'{reply_authority_raw}12366中心'
        taxpayer_region = unitname
        content_text = (
            f'纳税人所属地：{taxpayer_region}\n答复机构：{reply_authority}\n'
            f'答复时间：{reply_date}\n\n问：{question}\n\n答：{answer}'
        )
        await asyncio.sleep(random.uniform(
            self.source.request_delay_min or 0.5,
            self.source.request_delay_max or 1.0,
        ))
        return ParsedDocument(
            source_url=item.url,
            title=item.title,
            doc_type='qa',
            region_code='CN',
            qa_question=question,
            qa_answer=answer,
            content_text=content_text,
            content_markdown=content_text,
            content_html=r.text,
            content_hash=ParsedDocument.compute_hash(content_text),
            issuing_authority=reply_authority,
            issue_date=reply_date,
        )
