import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[3] / 'base_platform'))
from app.models import role, role_permission, user, tenant, knowledge_base
import zipfile
import tempfile
from typing import List, Optional
from sqlalchemy import text
from app.db.session import SessionLocal
from app.models.knowledge_base import KnowledgeBase, KnowledgeCategory, KnowledgeDocument
from app.services.knowledge.file_parser import get_file_parser_service

# Lower-cased file extensions (without the leading dot) that are handed to the parser.
_IMPORTABLE_EXTENSIONS = frozenset({'pdf', 'doc', 'docx', 'txt', 'md', 'html'})


def _import_zip_documents(db, parser, zip_file, category_id, tenant_id, user_id, seen_titles):
    """Stage a KnowledgeDocument for every new, parseable file inside one archive.

    The archive is extracted into a temporary directory; each file with a
    supported extension whose stem is not already a document title in the
    category is parsed and added to the session (nothing is committed here).

    Args:
        db: Open SQLAlchemy session.
        parser: Object exposing ``parse_file(bytes, ext)``.
        zip_file: Path of the ``.zip`` archive to import.
        category_id: ID of the KnowledgeCategory the documents belong to.
        tenant_id: Tenant that owns the documents.
        user_id: Stored as ``author_id`` on each new document (may be None).
        seen_titles: Mutable set of titles already known for this category;
            updated in place so that duplicates across archives are skipped
            without relying on the session flushing pending rows.

    Returns:
        Number of documents added to the session. Failures are reported on
        stdout and never propagate, so documents staged before a failure
        are still counted.
    """
    added = 0
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            with zipfile.ZipFile(zip_file, 'r') as zf:
                zf.extractall(tmpdir)
            for file_path in Path(tmpdir).rglob('*'):
                if not file_path.is_file():
                    continue
                ext = file_path.suffix.lower().lstrip('.')
                if ext not in _IMPORTABLE_EXTENSIONS:
                    continue
                title = file_path.stem
                try:
                    # Check for duplicates first: parsing is the expensive
                    # step and is pointless for an already-imported file.
                    if title in seen_titles:
                        continue
                    existing_doc = db.query(KnowledgeDocument).filter_by(title=title, category_id=category_id, tenant_id=tenant_id).first()
                    if existing_doc:
                        seen_titles.add(title)
                        continue
                    content = parser.parse_file(file_path.read_bytes(), ext)
                    if not content:
                        continue
                    doc = KnowledgeDocument(
                        title=title,
                        content=content,
                        summary=content[:200],
                        category_id=category_id,
                        file_type=ext,
                        status='published',
                        tenant_id=tenant_id,
                        author_id=user_id,
                        segmentation_mode='automatic',
                        chunk_size=1000,
                        chunk_overlap=200,
                        splitter_type='tax_article',
                        character_count=len(content),
                        is_vectorized=False,
                        vectorization_status='pending',
                    )
                    db.add(doc)
                    seen_titles.add(title)
                    added += 1
                except Exception as e:
                    # Best effort: one unreadable file must not abort the archive.
                    print(f'    ⚠ Failed to parse {file_path.name}: {e}')
                    continue
    except Exception as e:
        # Best effort: one broken archive must not abort the year.
        print(f'    ⚠ Failed to process {zip_file.name}: {e}')
    return added


def seed_tax_regulations(tenant_id: int, user_id: Optional[int] = None, years: Optional[List[int]] = None):
    """Import zipped tax-regulation documents into a tenant's knowledge base.

    Looks up the tenant's knowledge base of type ``'tax_regulations'``, then
    walks the per-year sub-directories of
    ``knowledge/documents/regulations/2026&2023-1984`` (relative to this file).
    For every year a ``'<year>年'`` category is fetched or created, each
    ``*.zip`` archive in the directory is extracted, and every supported file
    that parses to non-empty content is stored as a ``KnowledgeDocument`` with
    ``vectorization_status='pending'``. Work is committed once per year, and
    only when that year produced at least one new document.

    Args:
        tenant_id: Tenant whose schema (``tenant_<id>``) and knowledge base
            receive the documents.
        user_id: Recorded as the author of the imported documents.
        years: When given, only directories whose name parses to one of these
            integers are imported; non-numeric directory names are skipped.

    Raises:
        Exception: Any error outside the per-file / per-archive best-effort
            handling is reported, the session is rolled back, and the error
            is re-raised.
    """
    db = SessionLocal()
    try:
        # Created inside the try so the session is closed if this fails.
        parser = get_file_parser_service()
        # NOTE(review): tenant_id is interpolated into the statement; callers
        # must pass a trusted integer.
        db.execute(text(f'SET search_path TO tenant_{tenant_id}, public'))
        kb = db.query(KnowledgeBase).filter_by(type='tax_regulations', tenant_id=tenant_id).first()
        if not kb:
            print(f'  ✗ Tax regulations KB not found for tenant {tenant_id}')
            return
        docs_base = Path(__file__).parent / 'knowledge/documents/regulations/2026&2023-1984'
        if not docs_base.exists():
            print('  - No documents directory found, skipping import')
            return
        # A set gives O(1) membership tests; None means "no filter".
        wanted_years = None if years is None else set(years)
        total_imported = 0
        for year_dir in sorted(docs_base.iterdir()):
            if not year_dir.is_dir():
                continue
            year = year_dir.name
            if wanted_years is not None:
                try:
                    if int(year) not in wanted_years:
                        continue
                except ValueError:
                    continue
            category = db.query(KnowledgeCategory).filter_by(knowledge_base_id=kb.id, name=f'{year}年').first()
            if not category:
                # isdecimal() accepts exactly the digit characters int() can
                # convert (isdigit() also accepts e.g. superscripts).
                sort_order = int(year) if year.isdecimal() else 0
                category = KnowledgeCategory(knowledge_base_id=kb.id, name=f'{year}年', description=f'{year}年税法文档', sort_order=sort_order)
                db.add(category)
                # Flush so the new category has an ID for the documents below.
                db.flush()
            category_id = category.id
            seen_titles = set()
            year_count = 0
            for zip_file in year_dir.glob('*.zip'):
                year_count += _import_zip_documents(db, parser, zip_file, category_id, tenant_id, user_id, seen_titles)
            if year_count > 0:
                db.commit()
                total_imported += year_count
                print(f'  ✓ Imported {year_count} documents for {year}')
        if total_imported > 0:
            print(f'  ✓ Total imported: {total_imported} documents')
        else:
            print('  - No documents imported')
    except Exception as e:
        db.rollback()
        print(f'  ✗ Failed to import regulations: {e}')
        raise
    finally:
        db.close()

def vectorize_imported_documents(tenant_id: int, user_id: Optional[int] = None, batch_size: int = 10):
    """Vectorize every document of a tenant whose vectorization is pending.

    Each ``KnowledgeDocument`` with ``vectorization_status='pending'`` is
    passed to ``DocumentVectorizationService.vectorize_document``. A failure
    on one document is reported and counted, and processing continues.

    Args:
        tenant_id: Tenant whose schema (``tenant_<id>``) is searched.
        user_id: User on whose behalf vectorization runs. When falsy, the
            user with email ``admin@hellotax.cn`` is used instead.
        batch_size: A progress line is printed every ``batch_size`` documents
            and after the last one. The sign is ignored; ``0`` (or ``None``)
            is treated as ``1``.

    Raises:
        ValueError: No ``user_id`` was given and the fallback admin user does
            not exist.
        Exception: Any other error outside the per-document handling is
            reported and re-raised.
    """
    from app.services.knowledge.vectorization_service import DocumentVectorizationService
    db = SessionLocal()
    try:
        # Built inside the try so the session is closed if construction fails.
        vectorization_service = DocumentVectorizationService(db)
        db.execute(text(f'SET search_path TO tenant_{tenant_id}, public'))
        if not user_id:
            from app.models.user import User
            admin = db.query(User).filter_by(email='admin@hellotax.cn').first()
            if admin:
                user_id = admin.id
            else:
                raise ValueError('user_id is required and no platform admin found')
        pending_docs = db.query(KnowledgeDocument).filter_by(tenant_id=tenant_id, vectorization_status='pending').all()
        if not pending_docs:
            print('  - No documents to vectorize')
            return
        # Capture the IDs now so that neither the loop nor the failure handler
        # has to touch ORM instances after the service has used the session.
        doc_ids = [doc.id for doc in pending_docs]
        total = len(doc_ids)
        print(f'  → Vectorizing {total} documents...')
        # Guard against a zero interval, which would raise in the modulo below.
        report_every = abs(batch_size or 1)
        success_count = 0
        failed_count = 0
        kb = db.query(KnowledgeBase).filter_by(type='tax_regulations', tenant_id=tenant_id).first()
        model_id = None
        if kb and kb.code:
            from app.models.provider import Model
            # kb.code is matched against Model.id when numeric, else Model.code.
            if kb.code.isdigit():
                model = db.query(Model).filter(Model.id == int(kb.code)).first()
            else:
                model = db.query(Model).filter(Model.code == kb.code).first()
            if model:
                model_id = model.id
                print(f'  → Using embedding model ID: {model_id}')
        for i, doc_id in enumerate(doc_ids, 1):
            try:
                vectorization_service.vectorize_document(doc_id, tenant_id=tenant_id, user_id=user_id, model_id=model_id)
                success_count += 1
            except Exception as e:
                # Best effort: keep going with the remaining documents.
                failed_count += 1
                print(f'    ⚠ Failed to vectorize document {doc_id}: {e}')
            # Reported outside the try so a failed document at a reporting
            # boundary still produces a progress line, and so an error in the
            # reporting itself is never miscounted as a vectorization failure.
            if i % report_every == 0 or i == total:
                print(f'    Progress: {i}/{total} ({success_count} success, {failed_count} failed)')
        print(f'  ✓ Vectorization completed: {success_count} success, {failed_count} failed')
    except Exception as e:
        print(f'  ✗ Vectorization failed: {e}')
        raise
    finally:
        db.close()

def auto_tag_imported_documents(tenant_id: int):
    """Run auto-tagging on every document of a tenant that has no tag rows.

    Documents are selected with an outer join against ``DocumentTag`` and
    kept only where no tag row matched. Each one is handed to
    ``auto_tag_document_on_upload``; a failure on one document is reported
    and counted, and the loop carries on. The session is committed once
    after the loop.

    Args:
        tenant_id: Tenant whose schema (``tenant_<id>``) is searched.

    Raises:
        Exception: Any error outside the per-document handling triggers a
            rollback, is reported, and is re-raised.
    """
    from app.services.knowledge.auto_tagging import auto_tag_document_on_upload
    from app.models.knowledge_base import DocumentTag
    session = SessionLocal()
    try:
        session.execute(text(f'SET search_path TO tenant_{tenant_id}, public'))
        # Left join + "tag side is NULL" keeps only documents without tags.
        joined = session.query(KnowledgeDocument).outerjoin(
            DocumentTag,
            KnowledgeDocument.id == DocumentTag.document_id,
        )
        without_tags = joined.filter(
            KnowledgeDocument.tenant_id == tenant_id,
            DocumentTag.document_id.is_(None),
        )
        documents = without_tags.all()
        if not documents:
            print('  - No documents to tag')
            return
        doc_total = len(documents)
        print(f'  → Auto-tagging {doc_total} documents...')
        tagged = 0
        failures = 0
        position = 0
        for document in documents:
            position += 1
            try:
                auto_tag_document_on_upload(session, document.id)
                tagged += 1
                # Progress every tenth document and after the final one.
                if position == doc_total or position % 10 == 0:
                    print(f'    Progress: {position}/{doc_total} ({tagged} success, {failures} failed)')
            except Exception as exc:
                failures += 1
                print(f'    ⚠ Failed to tag document {document.id}: {exc}')
        session.commit()
        print(f'  ✓ Auto-tagging completed: {tagged} success, {failures} failed')
    except Exception as exc:
        session.rollback()
        print(f'  ✗ Auto-tagging failed: {exc}')
        raise
    finally:
        session.close()