o
    ia1                     @   s|   U d Z ddlmZmZmZmZ ddlZddlZddlm	Z	 e
eZdZdZG dd dZdaee ed	< d
efddZdS )zG
Graph-based Reranking Service
Advanced reranking using graph features
    )ListDictAnyOptionalN)Neo4jClientg333333?g333333?c                   @   s   e Zd ZdZdefddZ		ddeeee	f  dede
d	e
d
eeeef  deeee	f  fddZdee
 de
d	e
dee
ef fddZdee
 de
d	e
dee
ef fddZdee
 de
d	e
dee
ef fddZdS )GraphRerankerz\
    Graph-based reranking service
    Uses graph features to improve retrieval quality
    neo4j_clientc                 C   s
   || _ d S )Nr   )selfr    r   I/lsinfo/ai/hellotax_ai/base_platform/app/services/graph/graph_reranker.py__init__   s   
zGraphReranker.__init__Nresultsquery	tenant_idkb_idweightsreturnc              
   C   s  |s|S |du rddddd}dd |D }i }|durWz ddl m}	 ||	j|	j|	j| }
d	d
 |
D }W n tyV } zt	
d|  W Y d}~nd}~ww | |||}| |||}| |||}|D ]z}|d }|dd}||d}||d}||d}|d | |d |  |d |  |d |  }||d}|dkr|t9 }d|d< d|d< n|dkr|t9 }d|d< d|d< n||d< d|d< ||d< ||d< ||d< ||d< ||d< qn|jdd  dd! td"d# |D }t	d$t| d%| d& |S )'u  
        Rerank results using graph features

        Args:
            results:   Initial retrieval results
            query:     Query text
            tenant_id: Tenant ID
            kb_id:     Knowledge base ID
            weights:   Feature weights (default: balanced)
            db:        SQLAlchemy Session — needed for doc_status penalty lookup.
                       If None, obsolete penalty is skipped.

        Returns:
            Reranked results
        Ng?g?)vector_scorepagerankentity_overlappath_distancec                 S      g | ]}|d  qS )document_idr   .0rr   r   r   
<listcomp>B       z(GraphReranker.rerank.<locals>.<listcomp>r   )KnowledgeDocumentc                 S   s   i | ]	}|j |jp
d qS )	effective)id
doc_status)r   rowr   r   r   
<dictcomp>N       z(GraphReranker.rerank.<locals>.<dictcomp>u8   [GraphReranker] doc_status 查询失败（非致命）: r   score      ?        r   r   r   r   r    obsoleter"   Tstatus_penalizedamendedForiginal_scorepagerank_scoreentity_overlap_scorepath_distance_scorec                 S   s   | d S )Nr&   r   )xr   r   r   <lambda>   s    z&GraphReranker.rerank.<locals>.<lambda>)keyreversec                 s   s    | ]
}| d rdV  qdS )r*      N)getr   r   r   r   	<genexpr>   s    z'GraphReranker.rerank.<locals>.<genexpr>z	Reranked z results using graph features (z obsolete/amended penalized))app.models.knowledge_baser   r   r!   r"   filterin_all	Exceptionloggerwarning_calculate_pagerank_calculate_entity_overlap_calculate_path_distancesr5   _OBSOLETE_SCORE_PENALTY_AMENDED_SCORE_PENALTYsortsuminfolen)r
   r   r   r   r   r   dbdoc_idsdoc_status_mapr   rowsepagerank_scoresentity_overlap_scorespath_distancesresultdoc_idr   r   r   r   combined_scorestatuspenalized_countr   r   r   rerank   s~   






zGraphReranker.rerankrH   c           	   
      s  z_d}d}z| j j||||dd}W n+ ty>   | j j||||dd}|r1tdd |D nd  fdd|D }Y nw d	d
 |D }|r]t| dkr]fdd
| D }|W S  ty } ztd|  dd
 |D W  Y d}~S d}~ww )z
        Calculate PageRank scores for documents

        Args:
            doc_ids: Document IDs
            tenant_id: Tenant ID
            kb_id: Knowledge base ID

        Returns:
            Document ID -> PageRank score mapping
        aT  
            MATCH (d:Document)
            WHERE d.id IN $doc_ids
              AND d.tenant_id = $tenant_id
              AND d.kb_id = $kb_id
            CALL gds.pageRank.stream({
                nodeProjection: 'Document',
                relationshipProjection: {
                    REFERENCES: {
                        type: 'REFERENCES',
                        orientation: 'NATURAL'
                    },
                    SIMILAR_TO: {
                        type: 'SIMILAR_TO',
                        orientation: 'UNDIRECTED'
                    }
                },
                maxIterations: 20,
                dampingFactor: 0.85
            })
            YIELD nodeId, score
            WITH gds.util.asNode(nodeId) AS doc, score
            WHERE doc.id IN $doc_ids
            RETURN doc.id as doc_id, score
            a  
            MATCH (d:Document)
            WHERE d.id IN $doc_ids
              AND d.tenant_id = $tenant_id
              AND d.kb_id = $kb_id
            OPTIONAL MATCH (d)-[r:REFERENCES|SIMILAR_TO]-(other:Document)
            WHERE other.tenant_id = $tenant_id
              AND other.kb_id = $kb_id
            WITH d.id as doc_id, count(r) as degree
            RETURN doc_id, degree
            rH   r   r   
parametersc                 S   r   )degreer   r   r   r   r   r      r   z5GraphReranker._calculate_pagerank.<locals>.<listcomp>r4   c                    s"   g | ]}|d  |d   dqS )rP   rX   rP   r&   r   r   )
max_degreer   r   r      s    c                 S      i | ]	}|d  |d qS rY   r   r   r   r   r   r$      r%   z5GraphReranker._calculate_pagerank.<locals>.<dictcomp>r   c                       i | ]	\}}||  qS r   r   r   kv	max_scorer   r   r$      r%   zFailed to calculate PageRank: c                 S      i | ]}|d qS r(   r   r   rP   r   r   r   r$          N)r   execute_queryr;   maxvaluesitemsr<   error)	r
   rH   r   r   r   fallback_queryr   scoresrK   r   )rZ   ra   r   r>      sF   
	

z!GraphReranker._calculate_pagerankc           	   
      s   z<d}| j j||||dd}dd |D }|r-t|   dkr- fdd| D }|D ]
}||vr9d||< q/|W S  ty] } ztd	|  d
d |D W  Y d}~S d}~ww )z
        Calculate entity co-occurrence scores

        Args:
            doc_ids: Document IDs
            tenant_id: Tenant ID
            kb_id: Knowledge base ID

        Returns:
            Document ID -> entity overlap score mapping
        a  
            MATCH (d1:Document)
            WHERE d1.id IN $doc_ids
              AND d1.tenant_id = $tenant_id
              AND d1.kb_id = $kb_id
            MATCH (d1)-[:CONTAINS]->(e:Entity)<-[:CONTAINS]-(d2:Document)
            WHERE d2.id IN $doc_ids
              AND d2.tenant_id = $tenant_id
              AND d2.kb_id = $kb_id
              AND d1.id <> d2.id
            WITH d1.id as doc_id, count(DISTINCT e) as shared_entities
            RETURN doc_id, shared_entities
            rU   rV   c                 S   r[   )rP   shared_entitiesr   r   r   r   r   r$     r%   z;GraphReranker._calculate_entity_overlap.<locals>.<dictcomp>r   c                    r\   r   r   r]   r`   r   r   r$     r%   r(   z$Failed to calculate entity overlap: c                 S   rb   rc   r   rd   r   r   r   r$   #  re   N)r   rf   rg   rh   ri   r;   r<   rj   )	r
   rH   r   r   r   r   rl   rP   rK   r   r`   r   r?      s0   
z'GraphReranker._calculate_entity_overlapc              
   C   s   z3d}| j j||||dd}i }|D ]}|d }t| ||d < q|D ]
}	|	|vr0d||	< q&|W S  tyT }
 ztd|
  dd	 |D W  Y d
}
~
S d
}
~
ww )a  
        Calculate average path distances between documents

        Args:
            doc_ids: Document IDs
            tenant_id: Tenant ID
            kb_id: Knowledge base ID

        Returns:
            Document ID -> path distance score mapping (closer = higher score)
        a  
            MATCH (d1:Document)
            WHERE d1.id IN $doc_ids
              AND d1.tenant_id = $tenant_id
              AND d1.kb_id = $kb_id
            MATCH (d2:Document)
            WHERE d2.id IN $doc_ids
              AND d2.tenant_id = $tenant_id
              AND d2.kb_id = $kb_id
              AND d1.id <> d2.id
            MATCH path = shortestPath((d1)-[*..3]-(d2))
            WITH d1.id as doc_id, avg(length(path)) as avg_distance
            RETURN doc_id, avg_distance
            rU   rV   avg_distancerP   g?z$Failed to calculate path distances: c                 S   rb   )r'   r   rd   r   r   r   r$   _  re   z;GraphReranker._calculate_path_distances.<locals>.<dictcomp>N)r   rf   mathexpr;   r<   rj   )r
   rH   r   r   r   r   rl   r   distancerP   rK   r   r   r   r@   %  s.   
z'GraphReranker._calculate_path_distances)NN)__name__
__module____qualname____doc__r   r   r   r   strr   intr   floatrT   r>   r?   r@   r   r   r   r   r      sZ    	
l

a

;
r   graph_rerankerr   c                  C   s    t du rddlm}  t| a t S )z%Get or create graph reranker instanceNr   r	   )ry   app.services.graph.neo4j_clientr   r   r	   r   r   r   get_graph_rerankerf  s   r{   )ru   typingr   r   r   r   loggingro   rz   r   	getLoggerrr   r<   rA   rB   r   ry   __annotations__r{   r   r   r   r   <module>   s    
  Q