import re

import httpx

from app.models.tax_data import TaxDocument
from common_logging import get_logger
logger = get_logger(__name__)



_DOC_NUM_RE = re.compile('[\\u4e00-\\u9fa5]{2,10}[〔\\[\\（(](\\d{4})[〕\\]\\）)]第?(\\d+)号|(?:国务院令|主席令|国务院关税税则委员会令|全国人民代表大会常务委员会公告)第(\\d+)号')

def _parse_doc_number_parts(doc_number: str | None) -> dict:
    if not doc_number:
        return {'doc_number_year': None, 'doc_number_serial': None}
    m = _DOC_NUM_RE.search(doc_number)
    if m:
        if m.group(1):
            return {'doc_number_year': int(m.group(1)), 'doc_number_serial': int(m.group(2))}
        if m.group(3):
            return {'doc_number_year': None, 'doc_number_serial': int(m.group(3))}
    return {'doc_number_year': None, 'doc_number_serial': None}

class KnowledgeBaseClient:
    """Async HTTP client for the knowledge-base service's document endpoints.

    Each call opens a short-lived ``httpx.AsyncClient``. Most methods report
    failures by returning ``{'success': False, 'error': ...}`` rather than
    raising. The exceptions are :meth:`ensure_metadata_fields`, which lets
    HTTP errors propagate, and :meth:`annotate_document_metadata`, which is
    best-effort and always returns ``None``.
    """

    # Metadata field definitions that ensure_metadata_fields() creates on a
    # knowledge base when a field with the same field_key is not yet present.
    _TAX_METADATA_FIELDS = [
        {'field_name': '文号', 'field_key': 'doc_number', 'field_type': 'text'},
        {'field_name': '发文机关', 'field_key': 'issuing_authority', 'field_type': 'text'},
        {'field_name': '成文日期', 'field_key': 'issue_date', 'field_type': 'date'},
        {'field_name': '生效日期', 'field_key': 'effective_date', 'field_type': 'date'},
        {
            'field_name': '文档状态',
            'field_key': 'doc_status',
            'field_type': 'select',
            'field_options': ['effective', 'obsolete', 'partially_obsolete', 'amended', 'expired'],
        },
        {'field_name': '被废止文号', 'field_key': 'superseded_by', 'field_type': 'text'},
        {'field_name': '被废止文档标题', 'field_key': 'superseded_by_title', 'field_type': 'text'},
        {'field_name': '来源链接', 'field_key': 'source_url', 'field_type': 'text'},
        {'field_name': '税务分类ID', 'field_key': 'tax_category_id', 'field_type': 'number'},
    ]

    def __init__(self, base_url: str, api_key: str | None = None, timeout: int = 60):
        """Store connection settings.

        Args:
            base_url: Service root URL; trailing slashes are stripped.
            api_key: Optional token sent as a ``Bearer`` Authorization header.
            timeout: Timeout value passed to every ``httpx.AsyncClient``.
        """
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.timeout = timeout

    def _get_headers(self) -> dict:
        """Return JSON request headers, plus Authorization when an API key is set."""
        headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
        if self.api_key:
            headers['Authorization'] = f'Bearer {self.api_key}'
        return headers

    async def create_document(
        self,
        category_id: int | None,
        title: str,
        content: str,
        source_url: str | None = None,
        doc_number: str | None = None,
        doc_number_year: int | None = None,
        doc_number_serial: int | None = None,
        issuing_authority: str | None = None,
        issue_date: str | None = None,
        effective_date: str | None = None,
        expire_date: str | None = None,
        doc_status: str | None = None,
        supersedes_doc_ids: list[int] | None = None,
        tax_type_tags: list[str] | None = None,
        has_attachment: bool = False,
        attachment_types: list[str] | None = None,
        parse_quality_score: float | None = None,
        content_hash: str | None = None,
        version_number: int = 1,
        doc_type: str | None = None,
        data_center_doc_id: int | None = None,
    ) -> dict | None:
        """POST a new document to ``/api/v1/knowledge/documents``.

        The summary is the first 200 characters of ``content`` followed by
        ``'...'`` when the content is longer than that, otherwise the content
        itself. Optional arguments left as ``None`` are omitted from the
        request payload; ``has_attachment`` and ``version_number`` are always
        sent. ``doc_type`` falls back to ``'regulation'``.

        Returns:
            ``{'success': True, 'document_id': ..., 'data': ...}`` on success,
            or ``{'success': False, 'error': ...}`` on any failure.
        """
        url = f'{self.base_url}/api/v1/knowledge/documents'
        if content and len(content) > 200:
            summary = content[:200] + '...'
        else:
            summary = content or ''
        payload: dict = {
            'title': title,
            'content': content,
            'summary': summary,
            'category_id': category_id,
            'source': '国家税务总局',
            'reference_url': source_url,
            'status': 'published',
            'is_public': True,
            'segmentation_mode': 'automatic',
            'doc_type': doc_type or 'regulation',
        }
        optional_fields = {
            'doc_number': doc_number,
            'issuing_authority': issuing_authority,
            'doc_number_year': doc_number_year,
            'doc_number_serial': doc_number_serial,
            'issue_date': issue_date,
            'effective_date': effective_date,
            'expire_date': expire_date,
            'doc_status': doc_status,
            'supersedes_doc_ids': supersedes_doc_ids,
            'tax_type_tags': tax_type_tags,
            'attachment_types': attachment_types,
            'parse_quality_score': parse_quality_score,
            'content_hash': content_hash,
            'data_center_doc_id': data_center_doc_id,
        }
        # Only send optional fields the caller actually supplied.
        payload.update({key: value for key, value in optional_fields.items() if value is not None})
        payload['has_attachment'] = has_attachment
        payload['version_number'] = version_number
        try:
            async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
                response = await client.post(url, json=payload, headers=self._get_headers())
                response.raise_for_status()
                result = response.json()
                logger.info(f"文档创建成功: {title}, ID: {result.get('id')}")
                return {'success': True, 'document_id': result.get('id'), 'data': result}
        except httpx.HTTPStatusError as e:
            logger.error(f'创建文档失败 (HTTP {e.response.status_code}): {e.response.text}')
            return {'success': False, 'error': f'HTTP {e.response.status_code}: {e.response.text}'}
        except Exception as e:
            # Log with traceback, then report the failure instead of raising.
            logger.exception(f'创建文档失败: {e}')
            return {'success': False, 'error': str(e)}

    async def trigger_vectorization(self, document_id: int) -> dict | None:
        """POST to the document's ``/vectorize`` endpoint.

        Returns:
            ``{'success': True, 'data': ...}`` on success, or
            ``{'success': False, 'error': ...}`` on any failure.
        """
        url = f'{self.base_url}/api/v1/knowledge/documents/{document_id}/vectorize'
        try:
            async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
                response = await client.post(url, headers=self._get_headers())
                response.raise_for_status()
                result = response.json()
                logger.info(f'向量化任务已启动: document_id={document_id}')
                return {'success': True, 'data': result}
        except httpx.HTTPStatusError as e:
            logger.error(f'触发向量化失败 (HTTP {e.response.status_code}): {e.response.text}')
            return {'success': False, 'error': f'HTTP {e.response.status_code}: {e.response.text}'}
        except Exception as e:
            logger.exception(f'触发向量化失败: {e}')
            return {'success': False, 'error': str(e)}

    async def trigger_graph_build(self, kb_id: int, document_ids: list[int] | None = None) -> dict | None:
        """POST a graph build request for knowledge base ``kb_id``.

        The payload always carries ``rebuild: True``; ``document_ids`` is
        included only when it is non-empty.

        Returns:
            ``{'success': True, 'data': ...}`` on success, or
            ``{'success': False, 'error': ...}`` on any failure.
        """
        url = f'{self.base_url}/api/v1/knowledge/bases/{kb_id}/graph/build'
        payload: dict = {'rebuild': True}
        if document_ids:
            payload['document_ids'] = document_ids
        try:
            async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
                response = await client.post(url, json=payload, headers=self._get_headers())
                response.raise_for_status()
                result = response.json()
                logger.info(f'图谱构建任务已启动: kb_id={kb_id}')
                return {'success': True, 'data': result}
        except httpx.HTTPStatusError as e:
            logger.error(f'触发图谱构建失败 (HTTP {e.response.status_code}): {e.response.text}')
            return {'success': False, 'error': f'HTTP {e.response.status_code}: {e.response.text}'}
        except Exception as e:
            logger.exception(f'触发图谱构建失败: {e}')
            return {'success': False, 'error': str(e)}

    async def upload_file(self, file_path: str, file_name: str, kb_id: int) -> dict | None:
        """Upload a local file as multipart form data.

        Args:
            file_path: Path of the file to read (opened in binary mode).
            file_name: Name sent for the uploaded file part.
            kb_id: Sent as the ``kb_id`` form field.

        Returns:
            ``{'success': True, 'file_path': ..., 'data': ...}`` on success, or
            ``{'success': False, 'error': ...}`` on any failure, including
            failure to open the file.
        """
        url = f'{self.base_url}/api/v1/knowledge/documents/upload'
        # Only the auth header is sent here, not the JSON headers from
        # _get_headers(); presumably so httpx can set the multipart
        # Content-Type itself.
        headers = {'Authorization': f'Bearer {self.api_key}'} if self.api_key else {}
        try:
            with open(file_path, 'rb') as f:
                files = {'file': (file_name, f)}
                data = {'kb_id': kb_id}
                async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
                    response = await client.post(url, files=files, data=data, headers=headers)
                    response.raise_for_status()
                    result = response.json()
                    logger.info(f'文件上传成功: {file_name}')
                    return {'success': True, 'file_path': result.get('file_path'), 'data': result}
        except httpx.HTTPStatusError as e:
            logger.error(f'上传文件失败 (HTTP {e.response.status_code}): {e.response.text}')
            return {'success': False, 'error': f'HTTP {e.response.status_code}: {e.response.text}'}
        except Exception as e:
            logger.exception(f'上传文件失败: {e}')
            return {'success': False, 'error': str(e)}

    async def import_document(self, tax_doc: TaxDocument, kb_id: int = 1, category_id: int | None = None) -> dict | None:
        """Create a knowledge document from ``tax_doc`` and start follow-up jobs.

        Steps: create the document, annotate its metadata (when any is
        available), trigger vectorization, then trigger a graph build. Only a
        failed creation aborts the import; vectorization and graph-build
        failures are logged as warnings.

        Returns:
            ``{'success': True, 'knowledge_doc_id': ...}`` on success, the
            failed result of :meth:`create_document` when creation fails, or
            ``{'success': False, 'error': ...}`` on an unexpected exception.
        """
        logger.info(f'开始导入文档: {tax_doc.title}')
        try:
            num_parts = _parse_doc_number_parts(tax_doc.doc_number)
            full_content = tax_doc.content_text or tax_doc.content_markdown or ''
            attachments: list[dict] = tax_doc.attachments or []
            has_att = bool(attachments)
            att_types: list[str] | None = None
            if has_att:
                # De-duplicate, then sort so the payload is deterministic.
                att_types = sorted({a.get('type', '').lower() for a in attachments if a.get('type')})
            issue_date = tax_doc.issue_date.isoformat() if tax_doc.issue_date else None
            effective_date = tax_doc.effective_date.isoformat() if tax_doc.effective_date else None
            create_result = await self.create_document(
                category_id=category_id,
                title=tax_doc.title,
                content=full_content,
                source_url=tax_doc.source_url,
                doc_number=tax_doc.doc_number,
                doc_number_year=num_parts['doc_number_year'],
                doc_number_serial=num_parts['doc_number_serial'],
                issuing_authority=tax_doc.issuing_authority,
                issue_date=issue_date,
                effective_date=effective_date,
                doc_status='effective',
                tax_type_tags=None,
                has_attachment=has_att,
                attachment_types=att_types,
                parse_quality_score=None,
                content_hash=tax_doc.content_hash,
                version_number=1,
                doc_type='regulation',
                data_center_doc_id=tax_doc.id,
            )
            if not create_result or not create_result.get('success'):
                return create_result
            document_id = create_result['document_id']

            meta: dict = {}
            if tax_doc.doc_number:
                meta['doc_number'] = tax_doc.doc_number
            if tax_doc.issuing_authority:
                meta['issuing_authority'] = tax_doc.issuing_authority
            if tax_doc.issue_date:
                meta['issue_date'] = tax_doc.issue_date.isoformat()
            if tax_doc.effective_date:
                meta['effective_date'] = tax_doc.effective_date.isoformat()
            if tax_doc.source_url:
                meta['source_url'] = tax_doc.source_url
            if tax_doc.category_id is not None:
                meta['tax_category_id'] = str(tax_doc.category_id)
            if meta:
                await self.annotate_document_metadata(document_id, meta)

            vectorize_result = await self.trigger_vectorization(document_id=document_id)
            if not vectorize_result or not vectorize_result.get('success'):
                logger.warning(f'向量化失败: {vectorize_result}')
            graph_result = await self.trigger_graph_build(kb_id=kb_id, document_ids=[document_id])
            if not graph_result or not graph_result.get('success'):
                logger.warning(f'图谱构建失败（非致命）: {graph_result}')
            logger.info(f'文档导入完成: {tax_doc.title}, knowledge_doc_id={document_id}')
            return {'success': True, 'knowledge_doc_id': document_id}
        except Exception as e:
            logger.exception(f'导入文档失败: {e}')
            return {'success': False, 'error': str(e)}

    async def ensure_metadata_fields(self, kb_id: int) -> dict[str, int]:
        """Create any missing ``_TAX_METADATA_FIELDS`` on knowledge base ``kb_id``.

        Existing fields are fetched first; fields flagged ``is_system`` are
        ignored. Each definition whose ``field_key`` is not already present is
        created with a POST to the same URL.

        Returns:
            Mapping of ``field_key`` to field id covering both the existing
            non-system fields and the newly created ones.

        Raises:
            httpx.HTTPStatusError: If the listing or any creation request
                returns an error status (errors are not caught here).
        """
        fields_url = f'{self.base_url}/api/v1/knowledge/bases/{kb_id}/metadata-fields'
        headers = self._get_headers()
        async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
            resp = await client.get(fields_url, headers=headers)
            resp.raise_for_status()
            existing = {f['field_key']: f['id'] for f in resp.json() if not f.get('is_system')}
            field_key_to_id: dict[str, int] = dict(existing)
            for field_def in self._TAX_METADATA_FIELDS:
                if field_def['field_key'] in existing:
                    continue
                r = await client.post(fields_url, json=field_def, headers=headers)
                r.raise_for_status()
                created = r.json()
                field_key_to_id[field_def['field_key']] = created['id']
                logger.info(f"创建元数据字段: {field_def['field_key']} → id={created['id']}")
        return field_key_to_id

    async def annotate_document_metadata(self, document_id: int, metadata_dict: dict[str, str]) -> None:
        """PUT ``metadata_dict`` onto a document; best-effort.

        Any failure is logged as a warning and swallowed so that callers are
        not interrupted by metadata problems.
        """
        url = f'{self.base_url}/api/v1/knowledge/documents/{document_id}/metadata'
        try:
            async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=True) as client:
                resp = await client.put(url, json={'metadata': metadata_dict}, headers=self._get_headers())
                resp.raise_for_status()
                logger.info(f'文档元数据标注成功: document_id={document_id}')
        except Exception as e:
            logger.warning(f'文档元数据标注失败（非致命）: document_id={document_id} error={e}')

    async def get_document_status(self, document_id: int) -> dict | None:
        """GET a document by id.

        Returns:
            ``{'success': True, 'data': ...}`` on success, or
            ``{'success': False, 'error': ...}`` on any failure.
        """
        url = f'{self.base_url}/api/v1/knowledge/documents/{document_id}'
        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.get(url, headers=self._get_headers())
                response.raise_for_status()
                result = response.json()
                return {'success': True, 'data': result}
        except httpx.HTTPStatusError as e:
            logger.error(f'获取文档状态失败 (HTTP {e.response.status_code}): {e.response.text}')
            return {'success': False, 'error': f'HTTP {e.response.status_code}: {e.response.text}'}
        except Exception as e:
            logger.exception(f'获取文档状态失败: {e}')
            return {'success': False, 'error': str(e)}

    async def update_document_content(self, knowledge_doc_id: int, new_content: str) -> dict | None:
        """POST new content to the document's ``/revectorize`` endpoint.

        Returns:
            On success, the service's JSON response as-is (it is not wrapped
            in a ``{'success': True, ...}`` envelope, unlike the other
            methods). On failure, ``{'success': False, 'error': ...}``.
        """
        url = f'{self.base_url}/api/v1/knowledge/documents/{knowledge_doc_id}/revectorize'
        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(url, json={'new_content': new_content}, headers=self._get_headers())
                response.raise_for_status()
                result = response.json()
                logger.info(f"[revectorize] knowledge_doc_id={knowledge_doc_id} unchanged={result.get('unchanged')} changed={result.get('changed')} added={result.get('added')}")
                return result
        except httpx.HTTPStatusError as e:
            logger.error(f'增量重向量化失败 (HTTP {e.response.status_code}): {e.response.text}')
            return {'success': False, 'error': f'HTTP {e.response.status_code}: {e.response.text}'}
        except Exception as e:
            logger.exception(f'增量重向量化失败: {e}')
            return {'success': False, 'error': str(e)}
