
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from common_logging import get_logger

logger = get_logger(__name__)


class HTMLParser:
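    """Parse Chinese government document pages: extract metadata (title,
    document number, dates, status), the cleaned article body, and
    attachment/media links from raw HTML."""
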
    # Page chrome that precedes the article body
    # (font-size bars, share widgets, subscribe buttons, QR codes, ...).
    HEAD_NOISE_SELECTORS = [
        '.fontBar', '.font-size-bar', '.share-area', '.shareBox', '.fxzs',
        '.doc-info-bar', '#fontSizeControl', '.toolbar', 'p.collect',
        '.collect-btn', '.subscribe-btn', '.share-btn', '.sc', '.dy',
        '.dy_bg', '.dy_bg01', '.voice-play', '.qrcode-area',
        '.subscription-tip', '.action-bar', '.social-share', '#shareScript',
        '#share-1', 'p.tl', "[style*='display: none']",
        "[style*='display:none']", '.arctips .sets', '.num_fs', '.func',
        '.arc_date', '.article-qrcode', 'dl.article-qrcode',
    ]
    # Page chrome that follows the article body
    # (print/download buttons, feedback links, ...).
    TAIL_NOISE_SELECTORS = [
        '.printArea', '.print-area', '.error-suggest', '.feedback-section',
        '.doc-footer-bar', '.print-button', '.download-button', 'p.fontsize',
        'div.print', 'div.xxgk-download-box', 'div.item',
    ]
    NOISE_SELECTORS = HEAD_NOISE_SELECTORS + TAIL_NOISE_SELECTORS + [
        '.font-size-selector', '.share-buttons',
    ]
    # Literal text fragments stripped from the extracted content HTML
    # (font-size controls, share prompts, print/close links, ...).
    NOISE_TEXT_PATTERNS = [
        '字体：【大】【中】【小】', '字体：【大】 【中】 【小】', '分享到：',
        '全文有效', '【打印】', '【下载】', '纠错或建议', '打印本页',
        '关闭窗口', '收藏', '订阅',
        '已推送，请在"个人中心-我的订阅"中查看',
        '此稿件无标签，进入"订阅设置"中订阅更多',
        '语音播报：', '扫一扫在手机打开当前页',
        "$('#share-1').share();", '分享',
    ]

    def parse(self, html_content: str, source_url: str) -> dict:
        """Parse a document page into metadata, cleaned body HTML, and an
        attachment list."""
        soup = BeautifulSoup(html_content, 'lxml')
        # extract_metadata must run before extract_content: the latter
        # decomposes noise nodes in place, including .arc_date, which
        # metadata extraction reads for status and issue date.
        metadata = self.extract_metadata(soup)
        content = self.extract_content(soup, source_url)
        attachments = self.extract_attachments(soup, source_url)
        return {
            'title': metadata.get('title', ''),
            'subtitle': metadata.get('subtitle', ''),
            'doc_number': metadata.get('doc_number', ''),
            'issue_date': metadata.get('issue_date', ''),
            'effective_date': metadata.get('effective_date', ''),
            'doc_status': metadata.get('doc_status', ''),
            'issuing_authority': metadata.get('issuing_authority', ''),
            'content_html': content,
            'attachments': attachments,
        }

    def extract_metadata(self, soup: BeautifulSoup) -> dict:
        """Extract title, subtitle, document number, dates, status, and
        issuing authority from the page."""
        metadata = {}
        title_elem = soup.find('h3') or soup.find('h1') or soup.find('title')
        if title_elem:
            metadata['title'] = title_elem.get_text(strip=True)
        titlesm = soup.find('div', class_='titlesm')
        if titlesm:
            metadata['subtitle'] = titlesm.get_text(strip=True)
        arc_date = soup.find('p', class_='arc_date')
        if arc_date:
            xg = arc_date.find('span', class_='xg')
            if xg:
                status_text = xg.get_text(strip=True)
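                # Normalize the status label: 全文废止/全文失效 -> obsolete,
                # 部分废止/部分失效 -> partially_obsolete, 已修订/部分修订 ->
                # amended, 全文有效 -> effective.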
                if '全文废止' in status_text or '全文失效' in status_text:
                    metadata['doc_status'] = 'obsolete'
                elif '部分废止' in status_text or '部分失效' in status_text:
                    metadata['doc_status'] = 'partially_obsolete'
                elif '已修订' in status_text or '部分修订' in status_text:
                    metadata['doc_status'] = 'amended'
                elif '全文有效' in status_text:
                    metadata['doc_status'] = 'effective'
            date_span = arc_date.find('span', class_='date')
            if date_span:
                date_text = date_span.get_text(strip=True)
                parsed = self._parse_date(date_text)
                if parsed:
                    metadata['issue_date'] = parsed
        for pattern in [{'class': 'doc-number'}, {'class': 'document-number'}, {'id': 'docNumber'}]:
            elem = soup.find(attrs=pattern)
            if elem:
                metadata['doc_number'] = elem.get_text(strip=True)
                break
        text = soup.get_text()
        if not metadata.get('doc_number'):
            # Fallback: match a document number such as 国税发〔2024〕12号
            # anywhere in the page text.
            match = re.search(r'[国财税]\w{0,10}[发函字号]?〔?\[?(\d{4})〕?\]?第?\s*(\d+)\s*号', text)
            if match:
                metadata['doc_number'] = match.group(0)
        if not metadata.get('issue_date'):
            # Fallback: look for a labeled issue date, e.g. 成文日期：2024-01-05.
            match = re.search(r'成文日期[：:]\s*(\d{4}[-年]\d{1,2}[-月]\d{1,2}日?)', text)
            if match:
                metadata['issue_date'] = self._parse_date(match.group(1))
        # Effective date phrased as '自2024年1月1日起施行' or '2024年1月1日起施行'.
        effective_match = re.search(
            r'自\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日\s*[起以]?\s*(?:施行|执行|实施|生效)'
            r'|(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日\s*起\s*(?:施行|执行|实施)',
            text)
        if effective_match:
            g = effective_match.groups()
            y = g[0] or g[3]
            m = g[1] or g[4]
            d = g[2] or g[5]
            metadata['effective_date'] = f'{y}-{m.zfill(2)}-{d.zfill(2)}'
        for pattern in [{'class': 'issuing-authority'}, {'class': 'authority'}, {'id': 'authority'}]:
            elem = soup.find(attrs=pattern)
            if elem:
                metadata['issuing_authority'] = elem.get_text(strip=True)
                break
        return metadata

    def extract_content(self, soup: BeautifulSoup, source_url: str = '') -> str:
        """Strip noise nodes, locate the main article container, absolutize
        relative links, and return the cleaned body HTML. Mutates ``soup``."""
        for selector in self.NOISE_SELECTORS:
            for elem in soup.select(selector):
                elem.decompose()
        content_elem = (
            soup.find('div', class_='content')
            or soup.find('div', class_='article-content')
            or soup.find('div', class_='detail-content')
            or soup.find('div', class_='newsContent')
            or soup.find('div', id='content')
            or soup.find('div', id='articleContent')
            or soup.find('article')
            or soup.find('main')
        )
        if not content_elem:
            content_elem = soup.find('body')
        if not content_elem:
            logger.warning('Main article content not found')
            return ''
        if source_url:
            # Rewrite relative href/src attributes to absolute URLs.
            for tag in content_elem.find_all(True):
                for attr in ('href', 'src'):
                    val = tag.get(attr)
                    if val and not val.startswith(('http://', 'https://', '//', 'javascript:', '#', 'mailto:')):
                        tag[attr] = urljoin(source_url, val)
        content_html = str(content_elem)
        for pattern in self.NOISE_TEXT_PATTERNS:
            content_html = content_html.replace(pattern, '')
        return content_html

    def extract_attachments(self, soup: BeautifulSoup, source_url: str) -> list[dict]:
        """Collect attachment links as dicts of name, absolute URL, and file
        type, de-duplicated by URL."""
        attachments = []
        attachment_section = (
            soup.find('div', class_='attachments')
            or soup.find('div', class_='attachment-list')
            or soup.find('div', class_='fjList')
            or soup.find('div', class_='fjContainer')
            or soup.find('div', id='attachments')
        )
        SUPPORTED_EXTS = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.wps', '.zip', '.ppt', '.pptx']
        if not attachment_section:
            # Fallback: look for a short block mentioning 附件 (attachment)
            # that contains at least one link to a supported file type.
            for elem in soup.find_all(['div', 'section', 'p']):
                text = elem.get_text()
                if '附件' in text and len(text) < 200:
                    if any(any(ext in (a.get('href') or '').lower() for ext in SUPPORTED_EXTS) for a in elem.find_all('a')):
                        attachment_section = elem
                        break

        def _collect_link(link):
            # Build an attachment record for a link whose href points at a
            # supported file type; return None otherwise.
            href = link.get('href')
            if href and any(ext in href.lower() for ext in SUPPORTED_EXTS):
                full_url = urljoin(source_url, href)
                name = link.get_text(strip=True) or href.split('/')[-1]
                return {'name': name, 'url': full_url, 'type': self._get_file_type(href)}
            return None
        seen_urls = set()
        if attachment_section:
            for link in attachment_section.find_all('a'):
                item = _collect_link(link)
                if item and item['url'] not in seen_urls:
                    attachments.append(item)
                    seen_urls.add(item['url'])
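        # Editor-generated attachment links (class "tinymce-annex") can sit
        # outside any attachment section, so scan the whole page for them too.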
        for link in soup.find_all('a', class_='tinymce-annex'):
            item = _collect_link(link)
            if item and item['url'] not in seen_urls:
                attachments.append(item)
                seen_urls.add(item['url'])
        return attachments

    def extract_media(self, soup: BeautifulSoup, base_url: str = '') -> dict:
        """Collect image and video/iframe sources, skipping data URIs,
        duplicates, and known site-chrome images (logos, QR codes, footers)."""
        images, videos, seen = [], [], set()
        noise_patterns = ['core-plugins', 'css/images', '/logo', 'guohui', 'foot-img', 'down-code', 'header_logo', 'mailbox', 'ENbox']
        for img in soup.find_all('img', src=True):
            src_raw = img['src']
            if src_raw.startswith('data:') or src_raw in seen:
                continue
            src = urljoin(base_url, src_raw) if base_url and (not src_raw.startswith('http')) else src_raw
            if any(p in src for p in noise_patterns):
                continue
            images.append({'src': src, 'src_raw': src_raw, 'alt': img.get('alt', '')})
            seen.add(src_raw)
        for tag in soup.find_all(['video', 'source', 'iframe']):
            src = tag.get('src') or tag.get('data-src', '')
            if not src or src in seen:
                continue
            if base_url and (not src.startswith('http')):
                src = urljoin(base_url, src)
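            # Treat direct <video>/<source> tags as mp4 files; anything
            # embedded via <iframe> is recorded by its frame URL.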
            videos.append({'src': src, 'type': 'mp4' if tag.name in ('video', 'source') else 'iframe'})
            seen.add(src)
        return {'images': images, 'videos': videos}

    def _parse_date(self, date_text: str) -> str | None:
        """Normalize a date string ('2024-1-5', '2024年1月5日', full-width
        digits) to ISO 'YYYY-MM-DD'; return None if no date is found."""
        date_text = date_text.strip()
        # Convert full-width digits (０-９) to ASCII before matching.
        date_text = date_text.translate(str.maketrans('０１２３４５６７８９', '0123456789'))
        match = re.search(r'(\d{4})-(\d{1,2})-(\d{1,2})', date_text)
        if match:
            return f'{match.group(1)}-{match.group(2).zfill(2)}-{match.group(3).zfill(2)}'
        match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日?', date_text)
        if match:
            return f'{match.group(1)}-{match.group(2).zfill(2)}-{match.group(3).zfill(2)}'
        return None

    def _get_file_type(self, filename: str) -> str:
        """Infer the file type from a filename or URL; return 'unknown' if
        no supported extension appears."""
        filename_lower = filename.lower()
        # Check longer extensions first so '.docx' is not reported as 'doc'.
        # '.zip' is included here because extract_attachments accepts it.
        for ext in ('.pdf', '.docx', '.doc', '.xlsx', '.xls', '.wps', '.pptx', '.ppt', '.zip'):
            if ext in filename_lower:
                return ext[1:]
        return 'unknown'
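

# Minimal usage sketch. The HTML snippet and URL below are hypothetical and
# only illustrate the call pattern and the shape of the returned dict.
if __name__ == '__main__':
    sample_html = (
        '<html><body><h3>关于开展测试工作的通知</h3>'
        '<div class="content"><p>本通知自2024年1月1日起施行。</p>'
        '<p>附件：<a href="/files/tongzhi.pdf">通知全文.pdf</a></p>'
        '</div></body></html>'
    )
    parser = HTMLParser()
    result = parser.parse(sample_html, 'https://www.example.gov.cn/doc/1.html')
    print(result['title'])           # 关于开展测试工作的通知
    print(result['effective_date'])  # 2024-01-01
    print(result['attachments'])     # [{'name': '通知全文.pdf', 'url': 'https://www.example.gov.cn/files/tongzhi.pdf', 'type': 'pdf'}]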
