o
    "Æi(#  ã                   @   sj   d Z ddlmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZ e e¡ZdZG dd„ dƒZeƒ ZdS )	zR
Entity Extractor using LLM
Extracts entities from document text using OpenAI API
é    )ÚListÚDictÚAnyÚOptionalN)ÚOpenAI)Úsettingsug  ä½ æ˜¯ä¸€ä¸ªä¸“ä¸šçš„çŸ¥è¯†å›¾è°±æž„å»ºåŠ©æ‰‹ã€‚è¯·ä»Žä»¥ä¸‹æ–‡æ¡£ä¸­æå–å…³é”®å®žä½“å’Œå…³ç³»ã€‚

æ–‡æ¡£æ ‡é¢˜ï¼š{title}
æ–‡æ¡£å†…å®¹ï¼š
{content}

è¯·æŒ‰ç…§ä»¥ä¸‹JSONæ ¼å¼è¿”å›žï¼š
{{
  "entities": [
    {{
      "name": "å®žä½“åç§°",
      "type": "PERSON|ORG|LOCATION|CONCEPT|TERM|LAW|DATE|MONEY|PERCENT",
      "description": "ç®€çŸ­æè¿°ï¼ˆå¯é€‰ï¼‰",
      "salience": 0.0-1.0
    }}
  ],
  "relations": [
    {{
      "source": "å®žä½“1åç§°",
      "target": "å®žä½“2åç§°",
      "type": "RELATED_TO|PART_OF|INSTANCE_OF|DEFINED_BY|REGULATES",
      "confidence": 0.0-1.0
    }}
  ]
}}

è¦æ±‚ï¼š
1. åªæå–é‡è¦å®žä½“ï¼ˆè‡³å°‘å‡ºçŽ°2æ¬¡æˆ–å…·æœ‰å…³é”®æ„ä¹‰ï¼‰
2. æ¯ä¸ªæ–‡æ¡£æå–5-20ä¸ªå®žä½“
3. å…³ç³»è¦æœ‰æ˜Žç¡®è¯­ä¹‰
4. salienceè¡¨ç¤ºå®žä½“åœ¨æ–‡æ¡£ä¸­çš„é‡è¦æ€§ï¼ˆ0-1ï¼‰
5. confidenceè¡¨ç¤ºå…³ç³»çš„ç½®ä¿¡åº¦ï¼ˆ0-1ï¼‰
6. å®žä½“ç±»åž‹è¯´æ˜Žï¼š
   - PERSON: äººå
   - ORG: ç»„ç»‡æœºæž„
   - LOCATION: åœ°ç‚¹
   - CONCEPT: æ¦‚å¿µæœ¯è¯­
   - TERM: ä¸“ä¸šæœ¯è¯­
   - LAW: æ³•å¾‹æ³•è§„
   - DATE: æ—¥æœŸæ—¶é—´
   - MONEY: é‡‘é¢
   - PERCENT: ç™¾åˆ†æ¯”

è¯·åªè¿”å›žJSONï¼Œä¸è¦åŒ…å«å…¶ä»–æ–‡å­—ã€‚c                   @   sÜ   e Zd ZdZdd„ Zddd„Z		dded	ed
edeee	eee
f  f fdd„Zde	eee
f  de	eee
f  fdd„Zde	eee
f  de	eee
f  fdd„Z	dde	eeef  dede	eee
f  fdd„ZdS )ÚEntityExtractorz-
    Entity extraction service using LLM
    c                 C   s   d S ©N© )Úselfr
   r
   úK/lsinfo/ai/hellotax_ai/base_platform/app/services/graph/entity_extractor.pyÚ__init__B   s   zEntityExtractor.__init__Nc              
   C   sè   |rbzEddl m}m} | |¡ |¡ |jdk|jdk|jdk|jdk|j	 
d¡¡ ¡ }|rF|jrFt|j ¡ |jjp=|jjd}||jfW S W n tya } zt d|› ¡ W Y d}~nd}~ww tjrrttjtjd}|tjfS dS )	z£
        Get model configuration from database or environment

        Returns:
            Tuple of (client, model_name) or (None, None) if not available
        r   )ÚModelProviderÚModelÚchatTN)Úapi_keyÚbase_urlz#Failed to get model from database: )NN)Úapp.models.providerr   r   ÚqueryÚjoinÚfilterÚtypeÚenabledÚ
configuredr   ÚisnotÚfirstÚproviderr   Úget_api_keyr   Údefault_base_urlÚcodeÚ	ExceptionÚloggerÚwarningr   ÚOPENAI_API_KEYÚOPENAI_BASE_URLÚGRAPH_ENTITY_EXTRACTION_MODEL)r   Ú
db_sessionr   r   ÚmodelÚclientÚer
   r
   r   Ú_get_model_configF   s<   
ûú
þ€€ÿþ
z!EntityExtractor._get_model_configé   ÚtitleÚcontentÚ
max_lengthÚreturnc              
   C   sz  |   |¡\}}|st d¡ g g dœS t|ƒ|kr*|d|… d }t d|› d¡ zTtj||d}|jjj	|dd	d
œd|d
œgddddid}|j
d jj}	t |	¡}
|  |
 dg ¡¡}|  |
 dg ¡¡}t dt|ƒ› dt|ƒ› d|› d¡ ||dœW S  tjyž } zt d|› ¡ g g dœW  Y d}~S d}~w ty¼ } zt d|› ¡ g g dœW  Y d}~S d}~ww )ai  
        Extract entities and relations from document text

        Args:
            title: Document title
            content: Document content
            max_length: Maximum content length to process
            db_session: Optional database session to get model config

        Returns:
            Dictionary with 'entities' and 'relations' lists
        z<No model configuration available, skipping entity extraction)ÚentitiesÚ	relationsNz...zContent truncated to z characters©r,   r-   ÚsystemuZ   ä½ æ˜¯ä¸€ä¸ªä¸“ä¸šçš„çŸ¥è¯†å›¾è°±æž„å»ºåŠ©æ‰‹ï¼Œæ“…é•¿ä»Žæ–‡æœ¬ä¸­æå–å®žä½“å’Œå…³ç³»ã€‚)Úroler-   Úuserg333333Ó?iÐ  r   Újson_object)r'   ÚmessagesÚtemperatureÚ
max_tokensÚresponse_formatr   r0   r1   z
Extracted z entities and z relations from 'ú'z&Failed to parse LLM response as JSON: zEntity extraction failed: )r*   r!   r"   ÚlenÚinfoÚENTITY_EXTRACTION_PROMPTÚformatr   ÚcompletionsÚcreateÚchoicesÚmessager-   ÚjsonÚloadsÚ_validate_entitiesÚgetÚ_validate_relationsÚJSONDecodeErrorÚerrorr    )r   r,   r-   r.   r&   r(   Ú
model_nameÚpromptÚresponseÚresult_textÚresultr0   r1   r)   r
   r
   r   Úextract_entitiesn   sL   

þþø
&þ€€þz EntityExtractor.extract_entitiesr0   c              	   C   sˆ   h d£}g }|D ]9}|  d¡r|  d¡sq|d  ¡ }||vr!d}t|  dd¡ƒ}|tjk r/q| |d  ¡ ||  dd¡|d	œ¡ q|S )
zË
        Validate and filter entities based on confidence threshold

        Args:
            entities: Raw entity list from LLM

        Returns:
            Filtered and validated entity list
        >	   ÚLAWÚORGÚDATEÚTERMÚMONEYÚPERSONÚCONCEPTÚPERCENTÚLOCATIONÚnamer   rW   Úsalienceç      à?ÚdescriptionÚ )rZ   r   r]   r[   ©rG   ÚupperÚfloatr   ÚGRAPH_MIN_ENTITY_CONFIDENCEÚappendÚstrip)r   r0   Úvalid_typesÚ	validatedÚentityÚentity_typer[   r
   r
   r   rF   ³   s$   




üz"EntityExtractor._validate_entitiesr1   c                 C   s’   h d£}g }|D ]>}|  d¡r|  d¡r|  d¡sq|d  ¡ }||vr&d}t|  dd¡ƒ}|tjk r4q| |d  ¡ |d  ¡ ||dœ¡ q|S )	zÑ
        Validate and filter relations based on confidence threshold

        Args:
            relations: Raw relation list from LLM

        Returns:
            Filtered and validated relation list
        >   ÚPART_OFÚ	REGULATESÚ
DEFINED_BYÚ
RELATED_TOÚINSTANCE_OFÚsourceÚtargetr   rl   Ú
confidencer\   )rn   ro   r   rp   r_   )r   r1   re   rf   ÚrelationÚrel_typerp   r
   r
   r   rH   Ø   s$   




üz#EntityExtractor._validate_relationsé   Ú	documentsÚmax_concurrentc                 C   sP   g }|D ]!}| j | dd¡| dd¡d}| | d¡|d |d dœ¡ q|S )	zÿ
        Extract entities from multiple documents

        Args:
            documents: List of documents with 'title' and 'content'
            max_concurrent: Maximum concurrent API calls

        Returns:
            List of extraction results
        r,   r^   r-   r2   Úidr0   r1   )Údocument_idr0   r1   )rP   rG   rc   )r   rt   ru   ÚresultsÚdocrO   r
   r
   r   Úextract_entities_batchý   s   

þ
ýz&EntityExtractor.extract_entities_batchr	   )r+   N)rs   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r*   ÚstrÚintr   r   r   rP   rF   rH   rz   r
   r
   r
   r   r   =   s2    
,ûþýü
ú*E*%(ýþýür   )r~   Útypingr   r   r   r   rD   ÚloggingÚopenair   Ú
app.configr   Ú	getLoggerr{   r!   r>   r   Úentity_extractorr
   r
   r
   r   Ú<module>   s    
. 
`