o
    :/i+                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dl	mZ
 d dlZd dlmZ zd dlZW n ey=   edZY nw zd dlmZ W n eyV   eddZY nw G d	d
 d
eeZeG dd dZedejdZeddZde
jej ejB dede
jej ejB fddZde
jej dedede
jej fddZde
jej dedefddZ G dd dZ!dej"de#ded ed!e#de$ej" fd"d#Z%d$ej"d%e#d&e#d'e#de#f
d(d)Z&dS )*    N)	dataclass)Enum)Literal)PlaceholderModulelibrosascipysignalc                   @   s    e Zd ZdZdZdZdZdZdS )ChannelReductionz8Method to reduce multi-channel audio to target channels.meanfirstmaxsumN)__name__
__module____qualname____doc__MEANFIRSTMAXSUM r   r   b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/multimodal/audio.pyr	      s    r	   c                   @   sP   e Zd ZU dZdZedB ed< ejZ	eed< e
defddZdefd	d
ZdS )	AudioSpeca  Specification for target audio format.

    This dataclass defines the expected audio format for a model's feature
    extractor. It is used to normalize audio data before processing.

    Attributes:
        target_channels: Number of output channels. None means passthrough
            (no normalization). 1 = mono, 2 = stereo, etc.
        channel_reduction: Method to reduce channels when input has more
            channels than target. Only used when reducing channels.
       Ntarget_channelschannel_reductionreturnc                 C   s
   | j duS )z&Whether audio normalization is needed.Nr   selfr   r   r   needs_normalization6   s   
zAudioSpec.needs_normalizationc                 C   s&   | j d u rdS d| j  d| jj dS )NzAudioSpec(passthrough)zAudioSpec(channels=z, reduction=))r   r   valuer   r   r   r   __repr__;   s   

zAudioSpec.__repr__)r   r   r   r   r   int__annotations__r	   r   r   propertyboolr    strr#   r   r   r   r   r   %   s   
 r   r   )r   r   r   audiospecr   c                 C   s  |j s| S | jdkr|jdkr| S td|j d| jdkr(td| j d| jd | jd kr>t| tjr;| jn| j} | jd }||jkrJ| S ||jk rZtd| d	|j t| tj}|jdkr|j	t
jkr~|rvtj| dd
}|S | jdd}|S |j	t
jkr| d }|S |j	t
jkr|rtj| dd
}|S | jddj}|S |j	t
jkr|rtj| dd
}|S | jdd}|S td|j	 | d|j S )aK  Normalize audio to the specified format.

    This function handles channel reduction for multi-channel audio,
    supporting both numpy arrays and torch tensors.

    Args:
        audio: Input audio data. Can be:
            - 1D array/tensor: (time,) - already mono
            - 2D array/tensor: (channels, time) - standard format from torchaudio
            - 2D array/tensor: (time, channels) - format from soundfile
              (will be auto-detected and transposed if time > channels)
        spec: AudioSpec defining the target format.

    Returns:
        Normalized audio in the same type as input (numpy or torch).
        For mono output (target_channels=1), returns 1D array/tensor.

    Raises:
        ValueError: If audio has unsupported dimensions or channel expansion
            is requested (e.g., mono to stereo).
    r   zCannot expand mono audio to z	 channels   zUnsupported audio shape: z. Expected 1D or 2D.r   zCannot expand z channels to )axis)dimzUnknown reduction method: N)r    ndimr   
ValueErrorshape
isinstancenpndarrayTr   r	   r   r
   r   r   r   valuesr   r   )r)   r*   num_channelsis_numpyresultr   r   r   normalize_audioI   sN   






		r9   orig_sr	target_src                C   s   t j| ||dS )Nr:   r;   )r   resampler)   r:   r;   r   r   r   resample_audio_librosa   s   r?   c                C   s8   ||krt | d|| S ||k rt | || dS | S )Nr   )scipy_signalresample_polyr>   r   r   r   resample_audio_scipy   s
   rB   c                   @   sV   e Zd ZdZ		ddedB ded fddZd	eje	j
 d
edeje	j
 fddZdS )AudioResamplerz,Resample audio data to a target sample rate.Nr   r;   method)r   r   c                 C   s   || _ || _d S )N)r;   rD   )r   r;   rD   r   r   r   __init__   s   
zAudioResampler.__init__r)   r:   r   c                C   sx   | j d u r	tdtjt|t| j dddr|S | jdkr&t||| j dS | jdkr3t||| j dS td| j d	)
NzBAudio resampling is not supported when `target_sr` is not providedg        gư>)rel_tolabs_tolr   r<   r   zInvalid resampling method: z.. Supported methods are 'librosa' and 'scipy'.)	r;   RuntimeErrormathisclosefloatrD   r?   rB   r/   )r   r)   r:   r   r   r   r=      s,   


zAudioResampler.resample)Nr   )r   r   r   r   rK   r   rE   nptNDArrayr2   floatingr=   r   r   r   r   rC      s     


rC   
audio_datasample_ratemax_clip_duration_soverlap_duration_smin_energy_window_sizec                 C   s   t || }t || }g }d}|| jd k rY|| | jd kr.|| d|df  	 |S || | }	t|| | jd }
t| |	|
|}|| d||f  |}|| jd k s|S )a  Split audio into chunks with intelligent split points.

    Splits long audio into smaller chunks at low-energy regions to minimize
    cutting through speech. Uses overlapping windows to find quiet moments
    for splitting.

    Args:
        audio_data: Audio array to split. Can be 1D (mono) or multi-dimensional.
                   Splits along the last dimension (time axis).
        sample_rate: Sample rate of the audio in Hz.
        max_clip_duration_s: Maximum duration of each chunk in seconds.
        overlap_duration_s: Overlap duration in seconds between consecutive chunks.
                           Used to search for optimal split points.
        min_energy_window_size: Window size in samples for finding low-energy regions.

    Returns:
        List of audio chunks. Each chunk is a numpy array with the same shape
        as the input except for the last (time) dimension.

    Example:
        >>> audio = np.random.randn(1040000)  # 65 seconds at 16kHz
        >>> chunks = split_audio(
        ...     audio_data=audio,
        ...     sample_rate=16000,
        ...     max_clip_duration_s=30.0,
        ...     overlap_duration_s=1.0,
        ...     min_energy_window_size=1600,
        ... )
        >>> len(chunks)
        3
    r   .N)r$   r0   appendminfind_split_point)rO   rP   rQ   rR   rS   
chunk_sizeoverlap_sizechunksisearch_start
search_endsplit_pointr   r   r   split_audio   s$   &r_   wav	start_idxend_idxmin_energy_windowc           
      C   sh   | || }t j}d}tdt|| |D ]}||||  }|d  d }	|	|k r1|| }|	}q|S )a  Find the best point to split audio by looking for silence or low amplitude.

    Searches for the quietest region within a specified range by calculating
    RMS energy in sliding windows.

    Args:
        wav: Audio array. Can be 1D or multi-dimensional.
        start_idx: Start index of search region (inclusive).
        end_idx: End index of search region (exclusive).
        min_energy_window: Window size in samples for energy calculation.

    Returns:
        Index of the quietest point within the search region. This is the
        recommended split point to minimize audio artifacts.

    Example:
        >>> audio = np.random.randn(32000)
        >>> # Insert quiet region
        >>> audio[16000:17600] = 0.01
        >>> split_idx = find_split_point(
        ...     wav=audio,
        ...     start_idx=0,
        ...     end_idx=32000,
        ...     min_energy_window=1600,
        ... )
        >>> 16000 <= split_idx <= 17600
        True
    r   r+   g      ?)rI   infrangelenr
   )
r`   ra   rb   rc   segment
min_energyquietest_idxr[   windowenergyr   r   r   rW   !  s   "rW   )'rI   dataclassesr   enumr   typingr   numpyr2   numpy.typingrL   torchvllm.utils.import_utilsr   r   ImportErrorscipy.signalr   r@   placeholder_attrr(   r	   r   r   MONO_AUDIO_SPECPASSTHROUGH_AUDIO_SPECrM   rN   Tensorr9   rK   r?   rB   rC   r3   r$   listr_   rW   r   r   r   r   <module>   s   	

S


	

0
?