from common_logging import get_logger

from app.services.tax_data_processor.filename_utils import generate_directory_name, safe_filename

# Module-level logger, created by passing this module's ``__name__`` to the
# project's ``get_logger`` helper.
logger = get_logger(__name__)


class MetadataExtractor:

    def __init__(self):
        pass

    def extract(self, title: str, content: str, category_id: int) -> dict:
        content = content.translate(str.maketrans('０１２３４５６７８９', '0123456789'))
        issue_date = self._extract_issue_date(content)
        metadata = {'title': self._clean_title(title), 'doc_number': self._extract_doc_number(title, content), 'issue_date': issue_date, 'issuing_authority': self._extract_issuing_authority(title, content), 'effective_date': self._extract_effective_date(content, issue_date)}
        if not metadata['doc_number'] and not metadata['issue_date']:
            logger.warning("metadata extraction yielded no doc_number or issue_date", title=title, category_id=category_id)
        return metadata

    def _clean_title(self, title: str) -> str:
        title = ' '.join(title.split())
        noise_patterns = ['- 中国政府网', '- 国家税务总局', '_中华人民共和国中央人民政府']
        for pattern in noise_patterns:
            title = title.replace(pattern, '')
        return title.strip()
    # Regex source strings for recognising official document numbers.
    # ``_extract_doc_number`` and ``parse_doc_number`` try them in list order
    # and stop at the first match, so the order of entries is significant.
    # Every pattern defines the named groups ``prefix`` and ``serial``; most
    # also define ``year`` (the decree-style patterns do not).
    _DOC_NUM_PATTERNS = ['(?P<prefix>[一-龥][一-龥 ]{3,60}(?:公告|通告))\\s*(?P<year>\\d{4})年第(?P<serial>\\d+)号', '(?P<prefix>[一-龥][一-龥 ]{3,80})[〔\\[\\（(](?P<year>\\d{4})[〕\\]\\）)]第?\\s*(?P<serial>\\d+)\\s*号', '(?P<prefix>[一-龥]{2,10})[〔\\[\\（(](?P<year>\\d{4})[〕\\]\\）)]第?\\s*(?P<serial>\\d+)\\s*号', '(?P<prefix>(?!日)(?:国务院令|主席令|国务院关税税则委员会令|[一-龥]{2,25}部令|[一-龥]{2,25}委(?:员会)?令|[一-龥]{2,25}局令))第?(?P<serial>\\d+)号', '(?P<prefix>全国人民代表大会常务委员会公告)第(?P<serial>\\d+)号', '(?P<prefix>[一-龥]{2,50}公告)(?P<year>\\d{4})年第(?P<serial>\\d+)号', '[（(](?P<year>\\d{2,4})[）)](?P<prefix>[一-龥〔〕\\[\\]]{2,15}字)第(?P<serial>\\d+)号', '[〔](?P<year>\\d{2,4})[〕](?P<prefix>[一-龥〔〕\\[\\]]{2,15}字)第(?P<serial>\\d+)号', '\\\\?[\\[（(](?P<year>\\d{2,4})\\\\?[\\]）)](?P<prefix>[一-龥]{2,10})(?P<serial>\\d+)号', '(?P<prefix>[一-龥][一-龥 ]{3,60}(?:公告|通告))\\s*(?P<year>\\d{4})年(?P<serial>\\d+)号', '(?P<prefix>[一-龥]{2,20}公告)(?P<year>\\d{4})年(?P<serial>\\d+)号', '(?P<prefix>[一-龥、，,]{4,60}令)(?P<year>\\d{4})年第(?P<serial>\\d+)号', '(?P<prefix>(?:国务院令|主席令|[一-龥]{2,25}(?:部|委(?:员会)?|局)令))\\s*\\n\\s*第(?P<serial>\\d+)号', '\\\\?[（(](?P<year>\\d{2,4})\\\\[）)](?P<prefix>[一-龥〔〕\\[\\]]{2,15}字)第(?P<serial>\\d+)号']

    def _extract_doc_number(self, title: str, content: str) -> str | None:
        import re
        for text in (title, content[:300], content[:800]):
            for pat in self._DOC_NUM_PATTERNS:
                m = re.search(pat, text)
                if m:
                    result = m.group(0)
                    result = re.sub('^[日月年]+', '', result)
                    result = re.sub('[\\n\\r\\t]+', ' ', result).strip()
                    result = re.sub('(?<=[令局部会])\\s+(?=第)', '', result)
                    return result
        return None

    def parse_doc_number(self, doc_number: str) -> dict:
        import re
        if not doc_number:
            return {'year': None, 'serial': None, 'prefix': None}
        for pat in self._DOC_NUM_PATTERNS:
            m = re.search(pat, doc_number)
            if m:
                gd = m.groupdict()
                return {'year': int(gd['year']) if gd.get('year') else None, 'serial': int(gd['serial']) if gd.get('serial') else None, 'prefix': gd.get('prefix')}
        return {'year': None, 'serial': None, 'prefix': None}

    def _extract_issue_date(self, content: str) -> str | None:
        import re
        patterns = ['成文日期[：:]\\s*(\\d{4})[-年](\\d{1,2})[-月](\\d{1,2})日?', '发布日期[：:]\\s*(\\d{4})[-年](\\d{1,2})[-月](\\d{1,2})日?']
        for pattern in patterns:
            match = re.search(pattern, content[:1000])
            if match:
                year = match.group(1)
                month = match.group(2).zfill(2)
                day = match.group(3).zfill(2)
                return f'{year}-{month}-{day}'
        return None

    def _extract_issuing_authority(self, title: str, content: str) -> str | None:
        import re
        m = re.match('^([一-龥][一-龥\\s]{1,58}?)\\s*关于', title)
        if m:
            return m.group(1).strip()
        m = re.search('^([一-龥\\s]{4,60}?(?:部|局|委|会|署))', content[:200])
        if m:
            return m.group(1).strip()
        return None

    def _extract_effective_date(self, content: str, issue_date: str | None=None) -> str | None:
        import re
        label_match = re.search('生效日期[：:]\\s*(\\d{4})[-年](\\d{1,2})[-月](\\d{1,2})日?', content)
        if label_match:
            return f'{label_match.group(1)}-{label_match.group(2).zfill(2)}-{label_match.group(3).zfill(2)}'
        specific_patterns = ['自\\s*(\\d{4})\\s*年\\s*(\\d{1,2})\\s*月\\s*(\\d{1,2})\\s*日\\s*起?\\s*(?:施行|执行|实施|生效)', '(?<!自)(\\d{4})\\s*年\\s*(\\d{1,2})\\s*月\\s*(\\d{1,2})\\s*日\\s*起\\s*(?:施行|执行|实施|生效)']
        matches = []
        for pattern in specific_patterns:
            for match in re.finditer(pattern, content):
                snippet = content[max(0, match.start() - 16):match.end() + 32]
                if re.search('(?:全文|部分|条款)?\\s*(?:废止|失效)', snippet):
                    continue
                matches.append(match)
        if matches:
            match = matches[-1]
            return f'{match.group(1)}-{match.group(2).zfill(2)}-{match.group(3).zfill(2)}'
        if issue_date and re.search('(?:发布|印发|公布|颁布|起草)\\s*之日\\s*起?\\s*(?:施行|执行|实施|生效)', content):
            return issue_date
        return None

    def generate_filename(self, title: str, doc_number: str | None, issue_date: str | None, category_id: int=0, sub_title: str | None=None, extension: str='.md') -> str:
        """Build a filename by delegating to ``safe_filename``.

        All arguments are forwarded by keyword; ``issue_date`` is passed as the
        helper's ``compose_date`` argument.
        """
        filename = safe_filename(
            title=title,
            category_id=category_id,
            doc_number=doc_number,
            compose_date=issue_date,
            sub_title=sub_title,
            extension=extension,
        )
        return filename

    def generate_directory_name(self, issue_date: str | None, source_url: str) -> str:
        """Return the directory name from the module-level ``generate_directory_name`` helper.

        Inside this method the bare name resolves to the imported function,
        not to this method, so the call does not recurse.
        """
        directory = generate_directory_name(issue_date, source_url)
        return directory
