
    Xj>                         d dl Z d dlmZ d dlmZ d dlmZ d dlmZ  ee	          Z
 e            ZdZej         dZded	dfd
Zded	efdZddZ ej        ded          d             ZdS )    N)
celery_app)get_settings)DatabaseTask)
get_loggerzhttp://localhost:8400/v1z/internal/switch_modemodereturnc                    dd l }t          t          dd           pd}t          j        t
          d| id|id                                           t                              d|  d	           	 dd l	}|
                    d
ddd          }t          d          D ]R}|                    d          }|| k    r t                              d|              d S |                    d           St                              d|                    d                      d S # t          $ r(}t                              d|            Y d }~d S d }~ww xY w)Nr   internal_api_token r   zX-Internal-Token   )jsonheaderstimeoutu#   [ocr_tasks] 模式切换已触发: u   ，等待就绪...	localhosti  T)hostportdbdecode_responsesx   zllm:service:statusu    [ocr_tasks] 已切换到模式:    u5   [ocr_tasks] 等待模式切换超时，当前状态: u/   [ocr_tasks] 无法轮询 Redis，直接继续: )timegetattrsettingshttpxpostSWITCH_MODE_URLraise_for_statusloggerinforedisRedisrangegetsleepwarning	Exception)r   r   token_redisr_currentes           A/lsinfo/ai/hellotax_ai/data_center/backend/app/tasks/ocr_tasks.py_switch_moder.      s}   KKKH2D99?RE	Jfd^>PRW=Xbdeeevvxxx
KKNdNNNOOONLLkTLRRs 	 	Aee011G$EtEEFFFJJqMMMMlquuUiOjOjllmmmmm N N NLLLMMMMMMMMMNs    /A'D  AD   
E*EE
local_pathc           	          d}t          j        t           dddddd|  idd	|d
gdgddd          }|                                d         d         d         d                                         S )Nut   提取图片中所有文字内容，表格用Markdown格式输出，保持原始结构，不要添加任何解释。z/chat/completionszQwen3-VL-32B-Instructuser	image_urlurlzfile://)typer2   text)r4   r5   )rolecontenti   )modelmessages
max_tokensr   )r   r   choicesr   messager7   )r   r   VL_BASE_URLr   strip)r/   promptresps      r-   
_ocr_imagerA      s     DF:777H_w}  U`  pu  wM  AK  wM  wM  oN  LO  LO  Z`  jp  Qq  Qq  Kr  os  os  nt  DH  ?I  ?I  SV  W  W  WD99;;y!!$Y/	:@@BBB    c                    ddl m} | j        sd S  || j        d          }| j        pg D ]c}|                    d          s|                    d|d         fd          }|r)|                     |d	|d          d
d                     d|                    |                    t          |                              
                    dd          | _        d S )Nr   )BeautifulSouplxmlocr_textimgsrc_originalc                     | o|| v S )N )ssrcs     r-   <lambda>z'_rebuild_content_text.<locals>.<lambda>,   s    1>QTXYQY rB   )rL   z<p class="ocr-text">z</p>zhtml.parser r   )bs4rD   content_htmlinline_imagesr#   findinsert_aftermarkdown_to_rag_texthtml_to_markdownstrreplacecontent_text)doccleanerrD   soupimg_infotags         r-   _rebuild_content_textr^   $   s
   !!!!!! =)622D%+ r r||J'' 	ii.1I#Y#Y#YiZZ 	r]]+`HZDX+`+`+`boppqqq33G4L4LSQUYY4W4WXX``agikllCrB   Tz#app.tasks.ocr_tasks.run_media_batch)bindbasenamec                    ddl m} ddlm} ddlm} | j        }|                    |                              |j	        
                    d                                                     }d |D             }|st                              d           dddS t                              d	t          |           d
            |            }t          d           d}	 |D ]}	d}
|	j	        D ]}|                    d          s|                    d          s-	 t#          |d                   |d<   d}
I# t$          $ r8}t                              d|	j         d|d          d|            Y d }~d }~ww xY w|
r!t+          |	|            ||	d           	 |                                 |dz  }ddlm}  ||	j        |	j	        |	j                   |	j        ro|	j        rhdd l}ddlm}  |t>          j         tC          t>          dd                     }|"                    |#                    |	j        |	j$                             m# t$          $ rD}|%                                 t          &                    d|	j         d|            Y d }~d }~ww xY w	 t          d           n# t          d           w xY wt                              d| dt          |                      d|t          |          dS )Nr   )flag_modified)TaxDocument)DocumentCleanerc                 R    g | ]$}t          d  |j        pg D                       "|%S )c              3   l   K   | ]/}|                     d            o|                     d          V  0dS )rF   pathN)r#   ).0rG   s     r-   	<genexpr>z-run_media_batch.<locals>.<listcomp>.<genexpr>9   sA      %s%sVY#''**=*=&=&Q#''&//%s%s%s%s%s%srB   )anyrQ   )ri   ds     r-   
<listcomp>z#run_media_batch.<locals>.<listcomp>9   s>    tttQ#%s%s]^]l]rpr%s%s%s"s"stqtttrB   u'   [ocr_tasks] 无待处理文档，跳过T)success	processedu   [ocr_tasks] 待处理文档: u    条media_processingFrF   rh   z[ocr_tasks] doc_id=u    OCR 失败: u    — rQ      )index_document_media)KnowledgeBaseClientbase_platform_api_key)base_urlapi_keyu    写库失败: 	inferenceu   [ocr_tasks] 完成: processed=/)rn   ro   total)'sqlalchemy.orm.attributesrc   app.models.tax_datard   0app.services.tax_data_processor.document_cleanerre   r   queryfilterrQ   isnotallr   r   lenr.   r#   rA   r&   r%   idr^   commitapp.services.multimodal_indexerrr   inline_videosis_importedknowledge_doc_idasyncio5app.services.tax_data_processor.knowledge_base_clientrs   r   base_platform_urlr   runupdate_document_contentrX   rollbackerror)selfrc   rd   re   r   docspendingrZ   ro   rY   updatedrG   r,   rr   _asynciors   _clients                    r-   run_media_batchr   1   s   777777//////PPPPPP	B88K  ''(A(G(G(M(MNNRRTTDtt$tttG 1=>>>a000
KKBGBBBCCCoG#$$$I!" 	S 	SCG( e e77:&& cggfoo e&0V&=&=C
O"GG  e e eNN#c#c#ccRXk#c#c`a#c#cdddddddde S%c7333c?333SIIKKKNITTTTTT((1BCDUVVV 	n3+? 	n2222     
 #6"5x?Ycjks  vM  OS  dT  dT  #U  #U  #U W%D%DSEY[^[k%l%lmmm  S S SKKMMMLL!Qsv!Q!Qa!Q!QRRRRRRRRS%S	S@ 	[!!!![!!!!
KKKKKS\\KKLLL)c'llKKKsa    ;J3 D76J3 7
E9.E4/J3 4E99"J3 B1IJ3 
J9JJ3 JJ3 3K)r   N)r   app.celery_appr   
app.configr   app.tasks.processor_tasksr   common_loggingr   __name__r   r   r=   r   r   rV   r.   rA   r^   taskr   rJ   rB   r-   <module>r      s.    % % % % % % # # # # # # 2 2 2 2 2 2 % % % % % %	H		<>>(/FFFNs Nt N N N N$C3 C3 C C C C
m m m m d4YZZZ2L 2L [Z2L 2L 2LrB   