import hashlib
import re
import unicodedata

from common_logging import get_logger

logger = get_logger(__name__)

# Windows reserved device names, stored upper-cased (callers compare against
# an upper-cased stem). Besides CON/PRN/AUX/NUL this covers COM and LPT
# followed by any digit 0-9 or a superscript 1-3, per Microsoft's file
# naming rules.
_RESERVED_NAMES = frozenset(
    ['CON', 'PRN', 'AUX', 'NUL']
    + [
        f'{port}{suffix}'
        for port in ('COM', 'LPT')
        for suffix in '0123456789\u00b9\u00b2\u00b3'
    ]
)
# Characters replaced during sanitization: < > : " / \ | ? * plus the control
# characters U+0000-U+001F and U+007F.
_ILLEGAL_CHARS_RE = re.compile('[<>:"/\\\\|?*\\x00-\\x1f\\x7f]')
# Invisible characters removed outright: U+200B-U+200F, U+202A-U+202E,
# U+2060-U+2064, U+FEFF and U+00AD.
_ZERO_WIDTH_RE = re.compile('[\u200b-\u200f\u202a-\u202e\u2060-\u2064\ufeff\xad]')
# Byte budget (UTF-8) for a generated filename, extension included.
_MAX_BYTES = 200
# Category id -> naming rule used by safe_filename; ids not listed here fall
# back to 'title_docno_date' there.
_CATEGORY_RULES = {
    1: 'title_only',
    2: 'title_date',
    3: 'title_docno_date',
    4: 'title_subtitle',
    5: 'title_docno_date',
    6: 'title_docno_date',
    7: 'title_docno_date',
    8: 'title_docno_date',
}

def safe_filename(
    title: str,
    category_id: int,
    *,
    doc_number: str | None = None,
    compose_date: str | None = None,
    sub_title: str | None = None,
    extension: str = '.md',
) -> str:
    """Build a filesystem-safe filename from document metadata.

    The naming rule is looked up in ``_CATEGORY_RULES`` by ``category_id``
    (unknown ids use ``'title_docno_date'``) and decides which sanitized
    components are joined with ``_``:

    * ``title_only``       -> title
    * ``title_date``       -> title, date
    * ``title_subtitle``   -> title, subtitle
    * ``title_docno_date`` -> title, document number, date

    Empty components are skipped; if nothing remains the stem is
    ``'untitled'``. A stem whose base name is a Windows reserved device
    name gets a leading underscore. The stem is then truncated so that stem
    plus ``extension`` fits in ``_MAX_BYTES`` UTF-8 bytes.

    Args:
        title: Document title.
        category_id: Key into ``_CATEGORY_RULES``.
        doc_number: Optional document number.
        compose_date: Optional date string, passed through
            ``_normalize_date``.
        sub_title: Optional subtitle.
        extension: Suffix appended verbatim to the stem.

    Returns:
        The stem followed by ``extension``.
    """
    rule = _CATEGORY_RULES.get(category_id, 'title_docno_date')
    clean_title = _sanitize(title)
    clean_docno = _sanitize(doc_number) if doc_number else ''
    # _normalize_date hands back its input untouched when it cannot extract
    # a date, so the result may still contain separators or other illegal
    # characters and has to be sanitized like every other component.
    clean_date = _sanitize(_normalize_date(compose_date)) if compose_date else ''
    clean_subtitle = _sanitize(sub_title) if sub_title else ''

    if rule == 'title_only':
        candidates = [clean_title]
    elif rule == 'title_date':
        candidates = [clean_title, clean_date]
    elif rule == 'title_subtitle':
        candidates = [clean_title, clean_subtitle]
    else:
        candidates = [clean_title, clean_docno, clean_date]

    # Filter for every rule (including title_only) so an empty title falls
    # through to 'untitled' instead of producing a bare extension.
    parts = [p for p in candidates if p]
    stem = '_'.join(parts) if parts else 'untitled'

    # Windows resolves device names by the text before the first dot, so
    # 'CON.notes.md' is as reserved as 'CON.md'.
    if stem.split('.', 1)[0].upper() in _RESERVED_NAMES:
        stem = f'_{stem}'

    # Clamp at zero: a negative budget would be read as a slice-from-the-end.
    budget = max(_MAX_BYTES - len(extension.encode()), 0)
    stem = _truncate_to_bytes(stem, budget)
    return stem + extension

def _sanitize(text: str) -> str:
    """Return *text* reduced to a filename-safe fragment.

    Steps, in order: NFC-normalize, delete characters matched by
    ``_ZERO_WIDTH_RE``, replace characters matched by ``_ILLEGAL_CHARS_RE``
    with ``_``, collapse every run of whitespace/underscores into a single
    ``_``, then trim underscores and whitespace from both ends. A falsy
    input yields ``''``.
    """
    if not text:
        return ''
    composed = unicodedata.normalize('NFC', text)
    without_invisibles = _ZERO_WIDTH_RE.sub('', composed)
    only_legal = _ILLEGAL_CHARS_RE.sub('_', without_invisibles)
    # Collapsing after substitution also merges the underscores that the
    # illegal-character replacement just introduced.
    collapsed = re.sub('[\\s_]+', '_', only_legal)
    trimmed = collapsed.strip('_')
    return trimmed.strip()

def _normalize_date(date_str: str) -> str:
    """Reduce a date string to compact ``YYYYMMDD`` digits where possible.

    * A year-first date with separators (``2024-1-5``, ``2024.01.05``,
      ``2024-01-05T12:30``) is parsed field by field and month/day are
      zero-padded, so single-digit fields no longer collapse into an
      ambiguous 6- or 7-digit string.
    * Otherwise all non-digits are dropped; with six or more digits the
      first eight are returned (six or seven digits come back as-is).
    * With fewer than six digits *date_str* is returned unchanged, so
      callers must validate or sanitize the result themselves.

    Digits in the result are always ASCII, even if the input used other
    Unicode decimal digits (e.g. fullwidth forms).
    """
    # Optional non-digit prefix, 4-digit year, then 1-2 digit month and day,
    # each preceded by at least one separator. The lookahead stops the day
    # from swallowing the start of a longer digit run.
    fields = re.match(r'\D*(\d{4})\D+(\d{1,2})\D+(\d{1,2})(?!\d)', date_str)
    if fields:
        year, month, day = (int(group) for group in fields.groups())
        return f'{year:04d}{month:02d}{day:02d}'

    # \d matches any Unicode decimal digit; int() folds each one to ASCII.
    digits = ''.join(str(int(ch)) for ch in re.sub(r'\D', '', date_str))
    if len(digits) >= 6:
        return digits[:8]
    return date_str

def _truncate_to_bytes(text: str, max_bytes: int) -> str:
    """Return the longest prefix of *text* that fits in *max_bytes* of UTF-8.

    The cut never splits a multi-byte character: an incomplete trailing
    sequence is dropped. A zero or negative *max_bytes* yields ``''``.
    """
    if max_bytes <= 0:
        # Without this guard a negative bound would act as a slice-from-end
        # and return almost the whole string instead of nothing.
        return ''
    encoded = text.encode('utf-8')
    if len(encoded) <= max_bytes:
        return text
    # Only the tail can be invalid after slicing valid UTF-8, so 'ignore'
    # discards at most the partial final character.
    return encoded[:max_bytes].decode('utf-8', errors='ignore')

def generate_directory_name(issue_date: str | None, source_url: str) -> str:
    """Build a directory name of the form ``<hash8>_<YYYYMMDD>``.

    ``hash8`` is the first eight hex characters of the SHA-256 digest of
    *source_url* (UTF-8 encoded). The date part is *issue_date* passed
    through ``_normalize_date``; when *issue_date* is missing or does not
    normalize to exactly eight decimal digits, ``00000000`` is used.

    Args:
        issue_date: Date string, or ``None``/empty when unknown.
        source_url: URL the hash prefix is derived from.

    Returns:
        The directory name, with an ASCII-only date part.
    """
    hash8 = hashlib.sha256(source_url.encode('utf-8')).hexdigest()[:8]
    date_str = _normalize_date(issue_date) if issue_date else ''
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscripts that int() cannot convert.
    if len(date_str) == 8 and date_str.isdecimal():
        # Fold any non-ASCII decimal digits (e.g. fullwidth) to ASCII so the
        # directory name stays portable.
        date_str = ''.join(str(int(ch)) for ch in date_str)
    else:
        date_str = '00000000'
    return f'{hash8}_{date_str}'
