
    vj!                        d dl Z d dlZd dlmZmZmZmZ d dlZd dl	m c m
Z d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ  G d de          Zd	 Z G d
 d          Z ej        ej                   G d de                      ZdS )    N)AnyDictTupleUnion)File)Preprocessor)PREPROCESSORS)FieldsModeKeysc                   j     e Zd ZdZej        fdedef fdZdeee	f         deee	f         fdZ
 xZS )AudioBrainPreprocessorzA preprocessor takes audio file path and reads it into tensor

    Args:
        takes: the audio file field name
        provides: the tensor field name
        mode: process mode, default 'inference'
    takesprovidesc                      t          t          |           j        |g|R i | || _        || _        dd l}|j        j        j        | _        d S )Nr   )superr   __init__r   r   speechbraindataio
read_audio)selfr   r   modeargskwargssb	__class__s          n/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/modelscope/preprocessors/audio.pyr   zAudioBrainPreprocessor.__init__   s`     	5$d++4TKDKKKFKKK
     )*5    datareturnc                 Z    |                      || j                           }||| j        <   |S N)r   r   r   )r   r   results      r   __call__zAudioBrainPreprocessor.__call__&   s+    dj!122$T]r   )__name__
__module____qualname____doc__r   	INFERENCEstrr   r   r   r#   __classcell__)r   s   @r   r   r      s          (
6 
6
6
6 
6 
6 
6 
6 
6T#s(^ S#X        r   r   c                 2   t          | dd          }|                                }|                    d          }|                    d|          }|                    d|          }t          j        ||dz   |         t          j        d	          }|                    d
          }|                    d|          }|                    d|          }t          j        ||dz   |         t          j        d	          }|                                 ||fS )Nrzutf-8)encodingAddShift[]    )dtypesepRescale)openreadfindnp
fromstringfloat32close)filenamefpall_strpos1pos2pos3meanscales           r   load_kaldi_feature_transformrE   ,   s    	hg	.	.	.BggiiG<<
##D<<T""D<<T""D=$/rzsKKKD<<	""D<<T""D<<T""DM'$(4-0
LLLEHHJJJ;r   c                   ,    e Zd ZdZ	 	 	 ddZd Zd ZdS )	Featurez%Extract feat from one utterance.
    specNFc                    || _         || _        |d         |d         z  dz  | _        |d         |d         z  dz  | _        t	          j        | j        d          | _        d| _        ||t          j	        
                    |          r]t          d|            t          |          \  }}t	          j        |          | _        t	          j        |          | _        d	| _        |rc| j                                        | _        | j        r@| j                                        | _        | j                                        | _        dS dS dS )
aF  

        Args:
            fbank_config (dict):
            feat_type (str):
                raw: do nothing
                fbank: use kaldi.fbank
                spec: Real/Imag
                logpow: log(1+|x|^2)
            mvn_file (str): the path of data file for mean variance normalization
            cuda:
        frame_lengthsample_frequencyi  frame_shiftF)periodicNzloading mvn file: T)fbank_config	feat_typen_fft
hop_lengthtorchhamming_windowwindowmvnospathexistsprintrE   
from_numpyshiftrD   cuda)r   rN   rO   mvn_filer\   r[   rD   s          r   r   zFeature.__init__?   sJ   " )"!.1L5   #'(
&}59   #'(*4:FFFBGNN8$<$<1x112227AALE5)%00DJ)%00DJDH 	/+**,,DKx /!Z__..
!Z__..


		/ 	// /r   c           	         | j         dk    r|S | j         dk    rJddlmc m} t	          |j                  dk    r|                    d          } |j        |fi | j        }n| j         dk    rit          j
        |dz  | j        | j        | j        | j        dd	
          }t          j        |j        |j        gd                              dd          }n|| j         dk    rqt          j
        || j        | j        | j        | j        dd	
          }t          j        |          dz  }t          j        d|z                                 dd          }|S )zm

        Args:
            utt: in [-32768, 32767] range

        Returns:
             [..., T, F]
        rawfbankr   Nr1   rH   i   FT)centerreturn_complexdimlogpow   )rO   torchaudio.compliance.kaldi
compliancekaldilenshape	unsqueezer`   rN   rR   stftrP   rQ   rT   catrealimagpermuteabslog)r   uttrk   featrH   abspows         r   computezFeature.computee   sq    >U""J^w&&77777777739~~""mmA&&5;s88d&788DD^v%%:e

#% % %D 9di3<<<DDRLLDD^x'':

#% % %D Yt__a'F9QZ((00R88Dr   c                 <    | j         r|| j        z   }|| j        z  }|S r!   )rU   r[   rD   )r   rw   s     r   	normalizezFeature.normalize   s)    8 	%$*$D$*$Dr   )rH   NF)r$   r%   r&   r'   r   ry   r{    r   r   rG   rG   ;   s]         
 "	$/ $/ $/ $/L& & &P    r   rG   c                   n    e Zd ZdZd Zdeeeee	f         f         deee	f         fdZ
ed             ZdS )LinearAECAndFbanki>  c                     dd l }d| j        z  | _        |d         | _        t	          |d         |d         |d                   | _        |                                | _        |d         dk    | _        d S )	Nr   i   linear_aec_delayrN   rO   rU   mask_onnearend_mic)	MinDAECSAMPLE_RATEtrunc_lengthr   rG   featureloadmitaecmask_on_mic)r   	io_configr   s      r   r   zLinearAECAndFbank.__init__   sy     4#33 )*< =y8(5y7GI Illnn$Y/=@r   r   r   c           	         t          |t                    rQ|                     |d                   \  }}|                     |d                   \  }}t          j        |          }ns|                     |d                   \  }}|                     |d                   \  }}d|v r|                     |d                   \  }}nt          j        |          }| j                            ||          \  }}}}	t          j        t          | j	        |z            g          }
t          j
        |
|g          }t          t          |          t          |          t          |          t          |	          t          |                    }d}t          || j                  }|||         |||         |||         |	||         |||         f\  }}}}	}t          j                    }t          j        t          j        |                    }| j                            |          }t          j        ||gd          }t          j        t          j        |                    }| j                            |          }t          j        ||gd          }t          j        t          j        |	                    }	| j                            |	          }t          j        ||gd          }| j                            |          }|&t          j        t          j        |                    }| j        r|}n|}|||d}|S )	u7   Linear filtering the near end mic and far end audio, then extract the feature.

        Args:
            data: Dict with two keys and correspond audios: "nearend_mic" and "farend_speech".

        Returns:
            Dict with two keys and Tensor values: "base" linear filtered audio，and "feature"
        r   r1   r   farend_speechnearend_speechrd   N)basetargetr   )
isinstancetupleload_wavr9   
zeros_liker   do_linear_aeczerosintr   concatenateminrl   r   rR   FloatTensorrZ   r;   r   ry   rp   r{   r   )r   r   r   fsr   r   out_micout_ref
out_linearout_echoextra_zerosflenfstartrw   fbank_nearend_micfbank_out_linearfbank_out_echor   out_datas                      r   r#   zLinearAECAndFbank.__call__   s    dE"" 	<"mmDG44OK $d1g 6 6M2];77NN #mmD,?@@OK $d?.C D DM24''%)]]48H3I%J%J"!#{!;!;151J1J2( 2(.*h hD$9B$> ? ?@AAn(EFFLL#g,,JX! ! 4*++F4K '&+"6vd{#XfTk%:6$;'F)BWj(N  ""&rz+'>'>?? L00==y$ 12:::%bj&<&<==
<//
;;y$ 01q999#BJx$8$899--h77y$/Q777 |%%d++ %"-bj.H.HIIN 	DDD NtLLr   c                 N   dd l }t          | t                    rt          j        |           } n^t          | t
                    r)t          j        |           }t          j        |          } n t          dt          |            d          t          j        |           \  }}t          |j                  dk    rt          d          |t          j        k    r!|                    ||t          j                  }|                    t$          j                  t          j        fS )Nr   zUnsupported input type: .r1   z(modelscope error:The audio must be mono.)librosar   bytesioBytesIOr)   r   r7   	TypeErrortypewavrl   rm   
ValueErrorr~   r   resampleastyper9   r;   )inputsr   
file_bytessample_rater   s        r   r   zLinearAECAndFbank.load_wav   s   fe$$ 	HZ''FF$$ 	H6**JZ
++FFFtF||FFFGGGHV,,Ttz??QGHHH+777##D+$5$AC CD{{2:&&(9(EEEr   N)r$   r%   r&   r   r   r   r   r   r)   r   r#   staticmethodr   r|   r   r   r~   r~      s        KA A AAU5$sCx.#89 Ad38n A A A AF F F \F F Fr   r~   )r   rV   typingr   r   r   r   numpyr9   scipy.io.wavfilewavfiler   rR   modelscope.fileior   modelscope.preprocessorsr    modelscope.preprocessors.builderr	   modelscope.utils.constantr
   r   r   rE   rG   register_moduleaudior~   r|   r   r   <module>r      s   
			 				 * * * * * * * * * * * *               " " " " " " 1 1 1 1 1 1 : : : : : : 6 6 6 6 6 6 6 6    \   6  V V V V V V V Vr v|,,_F _F _F _F _F _F _F -,_F _F _Fr   