import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / 'base_platform'))
from sqlalchemy import text
from app.db.session import SessionLocal
import json

def _parse_tag_keywords(raw_keywords, tag_name):
    """Return the usable keyword strings for one tag.

    ``raw_keywords`` is the value of the tag's ``keywords`` column: either a
    JSON-encoded string or an already-decoded value. The tag name is used as
    the only keyword when the column is empty, cannot be decoded, or decodes
    to something that is not a string or a list/tuple. Entries that are not
    non-blank strings are dropped, because an empty keyword would match every
    document (``str.count('')`` is always positive) and a non-string keyword
    cannot be lower-cased.
    """
    if not raw_keywords:
        candidates = [tag_name]
    else:
        candidates = raw_keywords
        if isinstance(candidates, str):
            try:
                candidates = json.loads(candidates)
            except ValueError:
                # json.JSONDecodeError subclasses ValueError: malformed JSON.
                candidates = [tag_name]
        if isinstance(candidates, str):
            # A bare JSON string is a single keyword, not a sequence of characters.
            candidates = [candidates]
        elif not isinstance(candidates, (list, tuple)):
            candidates = [tag_name]
    return [kw for kw in candidates if isinstance(kw, str) and kw.strip()]


def _score_tag(title_text, content_text, keywords, search_weight):
    """Compute the confidence that a tag applies to a document.

    Args:
        title_text: Lower-cased document title.
        content_text: Lower-cased document content.
        keywords: Lower-cased keyword strings of the tag.
        search_weight: The tag's weight; a falsy value is treated as 1.0 and
            values above 2.0 are capped at 2.0.

    Returns:
        The confidence as a float, or ``None`` when no keyword occurs in the
        title or the content.
    """
    matched = 0
    title_hits = 0
    content_hits = 0
    for keyword in keywords:
        in_title = title_text.count(keyword)
        in_content = content_text.count(keyword)
        if in_title or in_content:
            matched += 1
            title_hits += in_title
            content_hits += in_content
    if not matched:
        return None
    # Share of the tag's keywords that occur at least once.
    base_confidence = min(matched / len(keywords), 1.0)
    # Title occurrences count double; the boost is capped at 0.3.
    frequency_boost = min((title_hits * 2.0 + content_hits) / 10, 0.3)
    # float() keeps the arithmetic working when the driver returns a Decimal.
    weight_factor = min(float(search_weight or 1.0), 2.0) / 2.0
    return min(base_confidence + frequency_boost, 1.0) * weight_factor


def auto_tag_documents_simple(tenant_id: int, confidence_threshold: float=0.3):
    """Attach up to five keyword-matched tags to every document of a tenant.

    Switches the session's ``search_path`` to ``tenant_<tenant_id>``, loads all
    rows of ``knowledge_documents`` and the enabled predefined tags of
    ``knowledge_tags``, scores each tag against each document and inserts the
    five best-scoring tags at or above the threshold into ``document_tags``
    (pairs that already exist are skipped). Work is committed per document.
    Progress and summary statistics are printed to stdout.

    Args:
        tenant_id: Tenant whose schema is processed; must be convertible to int.
        confidence_threshold: Minimum confidence for a tag to be attached.
            Documents whose title plus content is shorter than 500 characters
            use 60% of this value.

    Raises:
        TypeError, ValueError: If ``tenant_id`` is not convertible to an integer.
        Exception: Any error raised while working with the database is printed,
            the current transaction is rolled back, and the error is re-raised.
    """
    # The schema name is interpolated into SQL text, so insist on a real
    # integer before touching the database.
    schema_id = int(tenant_id)
    db = SessionLocal()
    try:
        db.execute(text(f'SET search_path TO tenant_{schema_id}, public'))
        docs = db.execute(text('\n            SELECT id, title, content\n            FROM knowledge_documents\n            ORDER BY id\n        ')).fetchall()
        print(f'找到 {len(docs)} 个文档')
        tags = db.execute(text("\n            SELECT id, name, tag_code, keywords, search_weight\n            FROM knowledge_tags\n            WHERE is_predefined = true\n            AND tag_category_id IS NOT NULL\n            AND status = 'enabled'\n        ")).fetchall()
        print(f'找到 {len(tags)} 个可用标签\n')

        # Keyword parsing does not depend on the document, so do it once per tag.
        tag_profiles = [
            (
                tag_id,
                [kw.lower() for kw in _parse_tag_keywords(raw_keywords, tag_name)],
                search_weight,
            )
            for tag_id, tag_name, _tag_code, raw_keywords, search_weight in tags
        ]

        success_count = 0
        for i, (doc_id, title, content) in enumerate(docs, 1):
            if i % 100 == 0:
                print(f'处理进度: {i}/{len(docs)}')
            # Either column may be NULL; treat that as empty text.
            title = title or ''
            content = content or ''
            title_text = title.lower()
            content_text = content.lower()
            # Short documents give fewer keyword hits, so relax the threshold.
            if len(title) + len(content) < 500:
                effective_threshold = confidence_threshold * 0.6
            else:
                effective_threshold = confidence_threshold

            matched_tags = []
            for tag_id, keywords, search_weight in tag_profiles:
                confidence = _score_tag(title_text, content_text, keywords, search_weight)
                if confidence is not None and confidence >= effective_threshold:
                    matched_tags.append((tag_id, confidence))
            if not matched_tags:
                continue

            # Keep only the five most confident tags.
            matched_tags.sort(key=lambda x: x[1], reverse=True)
            for tag_id, _confidence in matched_tags[:5]:
                exists = db.execute(text('\n                        SELECT 1 FROM document_tags\n                        WHERE document_id = :doc_id AND tag_id = :tag_id\n                    '), {'doc_id': doc_id, 'tag_id': tag_id}).fetchone()
                if not exists:
                    db.execute(text('\n                            INSERT INTO document_tags (document_id, tag_id, created_at, updated_at, is_deleted)\n                            VALUES (:doc_id, :tag_id, NOW(), NOW(), false)\n                        '), {'doc_id': doc_id, 'tag_id': tag_id})
            db.commit()
            success_count += 1

        print(f'\n✓ 完成！成功为 {success_count} 个文档打标签')
        total_tags = db.execute(text('SELECT COUNT(*) FROM document_tags')).scalar()
        tagged_docs = db.execute(text('SELECT COUNT(DISTINCT document_id) FROM document_tags')).scalar()
        print(f'\n统计:')
        print(f'  • 已打标签的文档: {tagged_docs}')
        print(f'  • 总标签数: {total_tags}')
        print(f'  • 平均每文档: {(total_tags / tagged_docs if tagged_docs else 0):.1f} 个标签')
    except Exception as e:
        db.rollback()
        print(f'错误: {e}')
        raise
    finally:
        db.close()
if __name__ == '__main__':
    import argparse

    # Command-line entry point: read the tenant id and threshold, echo them
    # between two separator lines, then run the tagger.
    cli = argparse.ArgumentParser(description='自动打标签脚本')
    cli.add_argument('--tenant-id', type=int, required=True, help='租户ID')
    cli.add_argument('--threshold', type=float, default=0.3, help='置信度阈值 (默认: 0.3)')
    options = cli.parse_args()

    separator = '=' * 60
    print(separator)
    print(f'自动打标签 - Tenant {options.tenant_id}')
    print(f'置信度阈值: {options.threshold}')
    print(separator + '\n')

    auto_tag_documents_simple(options.tenant_id, options.threshold)