import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / 'base_platform'))
import argparse
from typing import List
from sqlalchemy import text
from app.db.session import SessionLocal
from app.models.knowledge_base import KnowledgeBase, KnowledgeDocument
from app.models.user import User
from .seed.taxonomy import seed_taxonomy
from .seed.regulations import seed_tax_regulations
from .seed.knowledge import init_tax_knowledge_base

def batch_vectorize_documents(tenant_id: int, user_id: int):
    from app.services.knowledge.vectorization_service import DocumentVectorizationService
    db = SessionLocal()
    try:
        db.execute(text(f'SET search_path TO tenant_{tenant_id}, public'))
        kb = db.query(KnowledgeBase).filter_by(type='tax_regulations', tenant_id=tenant_id).first()
        if not kb:
            print(f'  ✗ Tax regulations KB not found')
            return 0
        docs = db.query(KnowledgeDocument).filter(KnowledgeDocument.tenant_id == tenant_id, KnowledgeDocument.is_vectorized == False, KnowledgeDocument.vectorization_status == 'pending').all()
        if not docs:
            print(f'  - No pending documents to vectorize')
            return 0
        print(f'  Found {len(docs)} documents to vectorize')
        service = DocumentVectorizationService(db)
        success_count = 0
        error_count = 0
        for i, doc in enumerate(docs, 1):
            try:
                print(f'  [{i}/{len(docs)}] Vectorizing: {doc.title[:50]}...')
                service.vectorize_document(document_id=doc.id, chunk_strategy='tax_article', chunk_size=1000, chunk_overlap=200, tenant_id=tenant_id, user_id=user_id)
                success_count += 1
            except Exception as e:
                error_count += 1
                print(f'    ⚠ Failed: {e}')
        print(f'  ✓ Vectorized {success_count} documents ({error_count} errors)')
        return success_count
    except Exception as e:
        print(f'  ✗ Vectorization failed: {e}')
        raise
    finally:
        db.close()

def batch_auto_tag_documents(tenant_id: int):
    from app.services.knowledge.auto_tagging import auto_tag_document_on_upload
    db = SessionLocal()
    try:
        db.execute(text(f'SET search_path TO tenant_{tenant_id}, public'))
        docs = db.query(KnowledgeDocument).filter(KnowledgeDocument.tenant_id == tenant_id).all()
        if not docs:
            print(f'  - No documents to tag')
            return 0
        print(f'  Found {len(docs)} documents to tag')
        success_count = 0
        error_count = 0
        for i, doc in enumerate(docs, 1):
            try:
                if i % 100 == 0:
                    print(f'  [{i}/{len(docs)}] Tagging documents...')
                tag_ids = auto_tag_document_on_upload(db=db, document_id=doc.id, confidence_threshold=0.5, max_tags=5, min_tags=3)
                if tag_ids:
                    success_count += 1
            except Exception as e:
                error_count += 1
                if error_count <= 5:
                    print(f'    ⚠ Failed to tag doc {doc.id}: {e}')
        print(f'  ✓ Tagged {success_count} documents ({error_count} errors)')
        return success_count
    except Exception as e:
        print(f'  ✗ Auto-tagging failed: {e}')
        raise
    finally:
        db.close()

def generate_report(tenant_id: int):
    db = SessionLocal()
    try:
        db.execute(text(f'SET search_path TO tenant_{tenant_id}, public'))
        tag_categories = db.execute(text('SELECT COUNT(*) FROM tag_categories')).scalar()
        tags = db.execute(text('SELECT COUNT(*) FROM knowledge_tags')).scalar()
        total_docs = db.execute(text('SELECT COUNT(*) FROM knowledge_documents')).scalar()
        vectorized_docs = db.execute(text('SELECT COUNT(*) FROM knowledge_documents WHERE is_vectorized = true')).scalar()
        tagged_docs = db.execute(text('\n            SELECT COUNT(DISTINCT document_id) FROM document_tags\n        ')).scalar()
        total_tag_assignments = db.execute(text('SELECT COUNT(*) FROM document_tags')).scalar()
        docs_by_year = db.execute(text('\n            SELECT kc.name, COUNT(kd.id) as count\n            FROM knowledge_categories kc\n            LEFT JOIN knowledge_documents kd ON kd.category_id = kc.id\n            GROUP BY kc.name\n            HAVING COUNT(kd.id) > 0\n            ORDER BY kc.name\n        ')).fetchall()
        print(f"\n{'=' * 60}")
        print(f'VERIFICATION REPORT - Tenant {tenant_id}')
        print(f"{'=' * 60}\n")
        print(f'📊 TAG SYSTEM')
        print(f'  • Tag categories: {tag_categories}')
        print(f'  • Total tags: {tags}')
        print(f'\n📚 DOCUMENTS')
        print(f'  • Total documents: {total_docs}')
        print(f'  • Vectorized: {vectorized_docs} ({(vectorized_docs * 100 // total_docs if total_docs > 0 else 0)}%)')
        print(f'  • Years covered: {len(docs_by_year)}')
        print(f'\n🏷️  AUTO-TAGGING')
        print(f'  • Documents with tags: {tagged_docs} ({(tagged_docs * 100 // total_docs if total_docs > 0 else 0)}%)')
        print(f'  • Total tag assignments: {total_tag_assignments}')
        print(f'  • Avg tags per document: {(total_tag_assignments / tagged_docs if tagged_docs > 0 else 0):.1f}')
        print(f'\n📅 DOCUMENTS BY YEAR')
        for year, count in docs_by_year:
            print(f'  • {year}: {count} documents')
        print(f"\n{'=' * 60}\n")
    finally:
        db.close()

def test_full_workflow(tenant_id: int, user_id: int, years: List[int]):
    print(f"\n{'=' * 60}")
    print(f'TAX DATA FULL WORKFLOW TEST')
    print(f"{'=' * 60}\n")
    print(f'Tenant ID: {tenant_id}')
    print(f'User ID: {user_id}')
    print(f"Years: {', '.join(map(str, years))}\n")
    print(f'[1/5] Creating knowledge bases...')
    init_tax_knowledge_base(tenant_id, user_id)
    print(f'\n[2/5] Importing tax tags...')
    seed_taxonomy(tenant_id)
    print(f'\n[3/5] Importing documents for specified years...')
    seed_tax_regulations(tenant_id, user_id, years=years)
    print(f'\n[4/5] Vectorizing documents...')
    vectorized_count = batch_vectorize_documents(tenant_id, user_id)
    print(f'\n[5/5] Auto-tagging documents...')
    tagged_count = batch_auto_tag_documents(tenant_id)
    generate_report(tenant_id)
    print(f'✅ Workflow completed successfully!\n')
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Test complete tax data workflow: import, vectorize, and auto-tag')
    parser.add_argument('--tenant-id', type=int, required=True, help='Tenant ID')
    parser.add_argument('--user-id', type=int, help='User ID (defaults to platform admin)')
    parser.add_argument('--years', nargs='+', type=int, default=[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2026], help='Years to import (default: 2010-2023, 2026)')
    args = parser.parse_args()
    user_id = args.user_id
    if not user_id:
        db = SessionLocal()
        admin = db.query(User).filter_by(email='admin@hellotax.cn').first()
        if admin:
            user_id = admin.id
        db.close()
    if not user_id:
        print('Error: Could not find platform admin user')
        sys.exit(1)
    test_full_workflow(args.tenant_id, user_id, args.years)