import hashlib
import re
from collections.abc import AsyncIterator
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup

from app.services.tax_data_processor.adapters.base import (
    BaseSourceAdapter,
    DocumentItem,
    ParsedDocument,
)
from app.services.tax_data_processor.document_cleaner import DocumentCleaner
from app.services.tax_data_processor.parsers.html_parser import HTMLParser

from common_logging import get_logger

logger = get_logger(__name__)

_cleaner = DocumentCleaner()
_html_parser = HTMLParser()
BASE_URL = 'https://guangdong.chinatax.gov.cn'
LIST_URL = f'{BASE_URL}/siteapps/webpage/gdtax/fgk/ssfgk/policy_list.jsp'
PAGE_SIZE = 20
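# Channel IDs for the site's local-policy and normative-document listings.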
LOCAL_POLICY_CHANNELS = [
    '0af77827a1b5428198734a86443a5aa9',
    '08b462774948428a8e5324af6c0ff0dc',
    'bd22f2d4b3ed47ed8a46d94012a065a1',
    '5aeb84c67a324b5a902e1fb1f26f9ef1',
    'b5b479dbcede424db4800688a1e436de',
    '5d9a4ef2ddda4d52890019891037b0a5',
    '29b8d020942f4423a42082cbb574b1b3',
    '958af3cc13d04ebd89f7d9473f481ebf',
    '83c7be78839d4f61a38399b3d65c52e1',
    'ff6dedf915b5403fb65017627e538a87',
    '75d5e02927374894b37abcc28383246e',
    '2b27f76b2dbf4d6da1d49afe001906c5',
    '5445c97bf2544f4fbe41bd36aeb54eb4',
    '264fe7add79246d7af2f6a40e2a15374',
    '0011b0e6925046b7ae5daa05e692f103',
    'e918dc1f349e4e7b9288e3486da20b5d',
    'fd87ea2c8f6347e7aacfa735e1c5813c',
    '5196669f9d4149bd8445511e1d537ad4',
    '8063fa1779354b05848ca43cd4fac595',
    'abea13d934574515a7d712c5b6b6b264',
    '07126a2a8e0d4db78de86d9c7b5bd463',
    '877dd90f062c4aa8aafcd64c3c4951c1',
]
NORMATIVE_CHANNELS = [
    'ac3e9d4341394111beb8118e84227ed1',
    '5eed470b7010478fb14f5e3ad33ca990',
    'e03078f612f24fe8b95c400cdfc0c516',
    'a3b095cbcb3a4c7dacc4da2f73e5af5d',
    'abf39049e05f4522ba34db5aba65f4ef',
    'dd1e22e66a7a4dbc877be62665689fc6',
    '88233c27796748848695f265a5d33b2f',
    'bd6662400b5a4cf082bd0ee33f335af0',
    '39d2bf87d4a849a89d63809e50cb5d32',
    '296e0fb39b67457eb5744883021d4ea6',
    'eec330825db24aa7b3db391385865c6b',
    'b41d2d4a4fcc48b1b98fcf92fa867783',
    '34c62f99481e43e28e2ad9fae40469b8',
    'cbe0a772ff9544db874acc6ca1c1ac42',
    '87fcee944fa44aa7bc9c0e2d8c6702d3',
    '90827d02107549d89a38cbf025048692',
    'baef14f5478840c1961f81e02edddccb',
    'c677b8cf98b5446cb6c909929df3e380',
    '21eec7af272b4038896ee6ad5cb12f00',
    'e5989a17f97e4d6580fecc58a4ef1edc',
    '8e21a59311f04ab99e8b15e66b02f37c',
    '03bb43c18fca4eb28a4a2e8a3829fff7',
]

async def _fetch_channel_page(client: httpx.AsyncClient, channel_id: str, page: int) -> str:
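    """POST the listing endpoint for one channel and return the raw HTML of the requested page."""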
    resp = await client.post(LIST_URL, data={'pageNo': page, 'pageSize': PAGE_SIZE, 'channelId': channel_id})
    return resp.text

def _parse_total_pages(html: str) -> int:
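    """Extract the total page count from the pager text 共N页, defaulting to 1 when missing."""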
    m = re.search('共(\\d+)页', html)
    return int(m.group(1)) if m else 1

def _parse_items(html: str) -> list[DocumentItem]:
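    """Parse a listing page into DocumentItems, keeping dated entries that link to
    content pages and noting whether an interpretation icon is present.
    """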
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    for li in soup.select('li.clearfix'):
        a = li.select_one('span.title a[href]')
        if not a:
            continue
        href = a['href']
        if 'content_' not in href:
            continue
        raw_title = a.get('title') or a.get_text(strip=True)
        title = BeautifulSoup(raw_title, 'html.parser').get_text(' ', strip=True)
        date_span = li.select_one('span.fwrq')
        date = date_span.get_text(strip=True) if date_span else None
        has_interp = bool(li.select_one('em.jdIcon'))
        if date and re.match('\\d{4}', date):
            items.append(DocumentItem(
                url=BASE_URL + href,
                title=title,
                date=date,
                extra={'has_interpretation': has_interp},
            ))
    return items

class GdtaxAdapter(BaseSourceAdapter):
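    """Adapter for the policy and regulation library on guangdong.chinatax.gov.cn."""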

    async def get_total_pages(self) -> int:
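        """Pagination is handled per channel in list_all_documents; this stub reports one page."""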
        return 1

    async def list_documents(self, page: int=1) -> list[DocumentItem]:
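        """Unused for this source; list_all_documents iterates every channel directly."""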
        return []

    async def list_all_documents(self) -> AsyncIterator[DocumentItem]:
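        """Walk every configured channel page by page.

        Each listed document is yielded first, followed by any interpretation
        articles linked from its detail page.
        """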
        channels = (
            [(cid, 'local_policy') for cid in LOCAL_POLICY_CHANNELS]
            + [(cid, 'normative') for cid in NORMATIVE_CHANNELS]
        )
        async with httpx.AsyncClient(timeout=30, headers={'User-Agent': 'Mozilla/5.0'}) as client:
            for channel_id, doc_type in channels:
                page = 1
                while True:
                    html = await _fetch_channel_page(client, channel_id, page)
                    items = _parse_items(html)
                    for item in items:
                        item.extra['doc_type'] = doc_type
                        yield item
                        # Fetch the detail page and yield any interpretation articles it links to.
                        resp = await client.get(item.url)
                        soup = BeautifulSoup(resp.text, 'html.parser')
                        for a in soup.select('div.rel_news.xgjd ul.infoList a[href]'):
                            href = a['href']
                            if 'content_' not in href:
                                continue
                            yield DocumentItem(
                                url=urljoin(BASE_URL, href),
                                title=a.get_text(strip=True),
                                date=item.date,
                                extra={'doc_type': 'interpretation', 'interprets_url': item.url},
                            )
                    if not items or page >= _parse_total_pages(html):
                        break
                    page += 1

    async def parse_document(self, item: DocumentItem) -> ParsedDocument | None:
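        """Fetch a document page and build a ParsedDocument with its content, metadata,
        status, and attachments. Returns None on a failed fetch or a missing content body.
        """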
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            resp = await client.get(item.url, headers={'User-Agent': 'Mozilla/5.0'})
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.text, 'html.parser')
        doc_type = item.extra.get('doc_type', 'local_policy')
        attachments = _html_parser.extract_attachments(soup, item.url) or None
        content_div = soup.select_one('#zoomcon')
        if not content_div:
            return None
        content_html = str(content_div)
        content_markdown = _cleaner.html_to_markdown(content_html)
        content_text = content_div.get_text('\n', strip=True)
        content_hash = hashlib.sha256(content_text.encode()).hexdigest()
        if doc_type == 'interpretation':
            refs = []
            for a in soup.select('div.rel_news.xgwj ul.infoList a[href]'):
                href = a['href']
                if 'content_' in href:
                    refs.append({'source_url': BASE_URL + href, 'title': a.get_text(strip=True)})
            return ParsedDocument(
                source_url=item.url,
                title=item.title,
                doc_type='interpretation',
                region_code='CN-GD',
                content_markdown=content_markdown,
                content_text=content_text,
                content_hash=content_hash,
                content_html=content_html,
                doc_status='effective',
                references=refs if refs else None,
                attachments=attachments,
            )
        memo = soup.select_one('div.memo')
        doc_number = memo.get_text(strip=True) if memo else None
        issuing_authority = None
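        # The 发布机构 (issuing authority) label may use a half- or full-width colon.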
        for dd in soup.select('div.meta_data dd'):
            text = dd.get_text(strip=True)
            if text.startswith('发布机构:') or text.startswith('发布机构：'):
                issuing_authority = re.sub('^发布机构[:：]', '', text).strip()
                break
        issue_date = None
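        # Normalize the date shown in the #lawfwrq span to YYYY-MM-DD.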
        date_span = soup.select_one('#lawfwrq')
        if date_span:
            raw = date_span.get_text(strip=True)
            m = re.search('(\\d{4})-(\\d{1,2})-(\\d{1,2})', raw)
            if m:
                issue_date = f'{m.group(1)}-{int(m.group(2)):02d}-{int(m.group(3)):02d}'
        doc_status = 'effective'
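        # The span.yxx validity badge: 失效/废止 (expired/repealed) -> obsolete; 修订 (revised) -> amended.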
        yxx = soup.select_one('span.yxx')
        if yxx:
            text = yxx.get_text(strip=True)
            if '失效' in text or '废止' in text:
                doc_status = 'obsolete'
            elif '修订' in text:
                doc_status = 'amended'
        return ParsedDocument(
            source_url=item.url,
            title=item.title,
            doc_type=doc_type,
            region_code='CN-GD',
            content_markdown=content_markdown,
            content_text=content_text,
            content_hash=content_hash,
            content_html=content_html,
            doc_number=doc_number,
            issuing_authority=issuing_authority,
            issue_date=issue_date,
            doc_status=doc_status,
            attachments=attachments,
        )
