o
    ՂiC                     @   sz   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ded	efd
dZejde	dddd ZdS )us   ASR 批处理任务 — 切换到 media_processing 模式，批量提取视频字幕，完成后切回 inference。    N)Path)logger)
celery_app)DatabaseTask)_switch_mode_rebuild_content_text
local_pathreturnc                 C   s   |  ddd d }z_z7tjdd| ddd	d
d|dg
ddd ddlm} |dd}|j|d}ddd |D W W t|jdd S  t	yi } zt
d|  d|  W Y d}~W t|jdd dS d}~ww t|jdd w )uC   用 FunASR paraformer-zh 提取视频音轨字幕，返回文字。.   r   z
_audio.wavffmpegz-iz-vnz-ar16000z-ac1z-yT)checkcapture_output)	AutoModelzparaformer-zh)model)input c                 s   s"    | ]}| d r|d  V  qdS )textNget).0r r   A/lsinfo/ai/hellotax_ai/data_center/backend/app/tasks/asr_tasks.py	<genexpr>   s     z_asr_video.<locals>.<genexpr>)
missing_oku   [asr_tasks] ASR 失败: u    — N )rsplit
subprocessrunfunasrr   generatejoinr   unlink	Exceptionr   warning)r   
audio_pathr   r   resulter   r   r   
_asr_video   s$   
r+   Tz!app.tasks.asr_tasks.run_asr_batch)bindbasenamec                 C   s  ddl m} ddlm} ddlm} | j}|||j	
d }dd |D }|s6td d	dd
S tdt| d | }td d}zt|D ]k}	d}
|	j	D ]}|ds`|dsaqTt|d }|ro||d< d	}
qT|
rt|	| ||	d z|  |d7 }ddlm} ||	j|	j|	j	 W qM ty } z|  td|	j d|  W Y d}~qMd}~ww qMW td ntd w td| dt|  d	|t|dS )uU   批量处理所有待 ASR 文档（inline_videos 有 path 但 transcript 为空）。r   )TaxDocument)DocumentCleaner)flag_modifiedNc                 S   s(   g | ]}t d d |jpg D r|qS )c                 s   s&    | ]}| d  o| dV  qdS )
transcriptpathNr   )r   vr   r   r   r   *   s   $ z+run_asr_batch.<locals>.<listcomp>.<genexpr>)anyinline_videos)r   dr   r   r   
<listcomp>(   s    z!run_asr_batch.<locals>.<listcomp>u'   [asr_tasks] 无待处理文档，跳过T)success	processedu   [asr_tasks] 待处理文档: u    条media_processingFr2   r3   r6   r   )index_document_mediaz[asr_tasks] doc_id=u    写库失败: 	inferenceu   [asr_tasks] 完成: processed=/)r9   r:   total)app.models.tax_datar/   0app.services.tax_data_processor.document_cleanerr0   sqlalchemy.orm.attributesr1   dbqueryfilterr6   isnotallr   infolenr   r   r+   r   commitapp.services.multimodal_indexerr<   idinline_imagesr&   rollbackerror)selfr/   r0   r1   rC   docspendingcleanerr:   docupdatedvidr2   r<   r*   r   r   r   run_asr_batch   sX   




$rW   )__doc__r    pathlibr   logurur   app.celery_appr   app.tasks.processor_tasksr   app.tasks.ocr_tasksr   r   strr+   taskrW   r   r   r   r   <module>   s    