
    #j>                         d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlZddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ e	rd dlm Z  ddl!m"Z" e G d d                      Z#ddde$fdZ%dS )    N)	dataclassfield)BytesIO)Path)TYPE_CHECKINGAnyClassVarOptionalUnion   )config)DownloadConfig)
array_cast)is_local_pathis_remote_urlxopen)no_op_if_value_is_nullstring_to_dictAudioDecoder   )FeatureTypec                   \   e Zd ZU dZdZee         ed<   dZe	ed<   dZ
ee         ed<   dZee         ed<    edd	          Zee         ed
<   dZee         ed<    ej         ej                     ej                    d          Zee         ed<    ed dd          Zeed<   d Zdeeeeedf         defdZ	 ddedeeeeee	df         f                  ddfdZdedeedf         f         fdZdeej         ej!        f         dej!        fdZ"	 d dej!        de	de	dej!        fdZ#dS )!Audioa
  Audio [`Feature`] to extract audio data from an audio file.

    Input: The Audio feature accepts as input:
    - A `str`: Absolute path to the audio file (i.e. random access is allowed).
    - A `pathlib.Path`: path to the audio file (i.e. random access is allowed).
    - A `dict` with the keys:

        - `path`: String with relative path of the audio file to the archive file.
        - `bytes`: Bytes content of the audio file.

      This is useful for parquet or webdataset files which embed audio files.

    - A `dict` with the keys:

        - `array`: Array containing the audio sample
        - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample.

    - A `torchcodec.decoders.AudioDecoder`: torchcodec audio decoder object.

    Output: The Audio features output data as `torchcodec.decoders.AudioDecoder` objects, with additional keys:

    - `array`: Array containing the audio sample
    - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample.

    Args:
        sampling_rate (`int`, *optional*):
            Target sampling rate. If `None`, the native sampling rate is used.
        num_channels (`int`, *optional*):
             The desired number of channels of the samples. By default, the number of channels of the source is used.
             Audio decoding will return samples with shape (num_channels, num_samples)
             Currently `None` (number of channels of the source, default), `1` (mono) or `2` (stereo) channels are supported.
             The `num_channels` argument is passed to `torchcodec.decoders.AudioDecoder`.

             <Added version="4.4.0"/>
        decode (`bool`, defaults to `True`):
            Whether to decode the audio data. If `False`,
            returns the underlying dictionary in the format `{"path": audio_path, "bytes": audio_bytes}`.
        stream_index (`int`, *optional*):
            The streaming index to use from the file. If `None` defaults to the "best" index.

    Example:

    ```py
    >>> from datasets import load_dataset, Audio
    >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train")
    >>> ds = ds.cast_column("audio", Audio(sampling_rate=44100, num_channels=2))
    >>> ds[0]["audio"]
    <datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>
    >>> audio = ds[0]["audio"]
    >>> audio.get_samples_played_in_range(0, 10)
    AudioSamples:
        data (shape): torch.Size([2, 110592])
        pts_seconds: 0.0
        duration_seconds: 2.507755102040816
        sample_rate: 44100
    ```
    Nsampling_rateTdecodenum_channelsstream_indexF)defaultrepriddictdtypebytespathpa_type)r   initr    _typec                     | j         S N)r'   )selfs    a/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/datasets/features/audio.py__call__zAudio.__call__]   s
    |    valuer   returnc                    	 ddl }ddlm} n"# t          $ r}t          d          |d}~ww xY w|t	          d          t
          j        rddlm} nd}t          |t                    rd|dS t          |t                    r$dt          |                                          dS t          |t          t          f          r|ddS |t          ||          rt          |          S d|v rt!                      } ||                    |d                             t&          j                            |d	         
                              |d| j                   |                                ddS |                    d          }t2          j                            |d                   rW|d                             d          r#|                    d	          t;          d          |                    d          rGt'          j        |d         t&          j                                      t&          j                  dz  }n=t'          j         |d         dd                              t&          j                  dz  }t!                      } ||                    |          |d	         
                              |d| j                   |                                ddS d|                    d          dS |                    d          |                    d          +|                    d          |                    d          dS t	          d| d          )zEncode example into a format for Arrow.

        Args:
            value (`str`, `bytes`,`bytearray`,`dict`, `AudioDecoder`):
                Data passed as input to Audio feature.

        Returns:
            `dict`
        r   NAudioEncoder<To support encoding audio data, please install 'torchcodec'.zvalue must be providedr   r$   arrayr   sample_ratewavformatr   r&   pcmzBTo use PCM files, please specify a 'sampling_rate' in Audio objectr%   )r#   i  hr)r#   modezUAn audio sample should have one of 'path' or 'bytes' but they are missing or None in .)!torchtorchcodec.encodersr4   ImportError
ValueErrorr   TORCHCODEC_AVAILABLEtorchcodec.decodersr   
isinstancestrr   absoluter%   	bytearrayencode_torchcodec_audior   
from_numpyastypenpfloat32to_file_liker   getvaluegetosr&   isfileendswithKeyError
frombufferint16memmap)r,   r0   rA   r4   errr   bufferbytes_values           r-   encode_examplezAudio.encode_example`   s   	gLLL8888888 	g 	g 	g\]]cff	g =5666& 	 8888888  LeS!! )	!5111t$$ '	!3u~~/?/?+@+@AAAy122 %	"D111%*UL*I*I%*5111YYFL  w!6!6rz!B!BCCQVWfQg  l6%d>OlPPP#__..===YYv*rw~~eFm/L/L*V}%%e,, B99_--5"#ghhh99W%% k"$-gbh"O"O"O"V"VWYWa"b"bej"jKK"$)E&M3"O"O"O"V"VWYWa"b"bej"jK U--k::oH^___ll5t7H m    "(!2!2DAAA!%uyy/@/@AAAYYw+uyy/@/@/L"YYw//69J9JKKKphmppp  s   
 
,',token_per_repo_idc                 F   t           j        rddlm} nt	          d          | j        st          d          |d         |d         |d         fn	|d         df\  }}||t          d| d	          |.t          |          r ||| j	        | j
        | j        
          }n||pi }|                    d          d         }|                    t           j                  rt           j        nt           j        }t#          ||          }	|	|                    |	d                   nd}
t'          |
          }t)          |d|          } ||| j	        | j
        | j        
          }n ||| j	        | j
        | j        
          }||d|_        ||j        _        |S )a,  Decode example audio file into audio data.

        Args:
            value (`dict`):
                A dictionary with keys:

                - `path`: String with relative audio file path.
                - `bytes`: Bytes of the audio file.
            token_per_repo_id (`dict`, *optional*):
                To access and decode
                audio files from private repositories on the Hub, you can pass
                a dictionary repo_id (`str`) -> token (`bool` or `str`)

        Returns:
            `torchcodec.decoders.AudioDecoder`
        r   r   z<To support decoding audio data, please install 'torchcodec'.zMDecoding is disabled for this feature. Please use Audio(decode=True) instead.r%   Nr&   zJAn audio sample should have one of 'path' or 'bytes' but both are None in r@   )r   r8   r   ::repo_idtokenrbdownload_config)r&   r%   )r   rE   _torchcodecr   rC   r   RuntimeErrorrD   r   r   r   r   split
startswithHF_ENDPOINTHUB_DATASETS_URLHUB_DATASETS_HFFS_URLr   rR   r   r   _hf_encodedmetadatar&   )r,   r0   r^   r   r&   r%   audio
source_urlpatternsource_url_fieldsrd   rg   fs                r-   decode_examplezAudio.decode_example   s   & & 	^1111111\]]]{ 	pnooo9>w9SuV}eGn55Z_`fZgimYne<EMrjorrrsss=]400= L4#4$BTcgct  EE ] 1 7RD))"-J+5+@+@AS+T+Tv''Z`Zv  !/z7 C CK\Kh%))*;I*FGGGnrE,5999OdD/BBBA L 1t?Q`d`q  EE
 !LD$54CUdhdu  E &*E::"r/   r   c                 h    ddl m} | j        rt          d           |d           |d          dS )z[If in the decodable state, raise an error, otherwise flatten the feature into a dictionary.r   )Valuez'Cannot flatten a decoded Audio feature.binarystringr$   )featuresrx   r   rD   )r,   rx   s     r-   flattenzAudio.flatten   sS    ######; 	HFGGGU8__E(OO
 
 	
r/   storagec                 @   t           j                            |j                  rrt          j        dgt          |          z  t          j                              }t           j                            ||gddg|	                                          }nt           j        
                    |j                  rt          |t          j                              }t          j        dgt          |          z  t          j                              }t           j                            ||gddg|	                                          }n=t           j                            |j                  rrt          j        dgt          |          z  t          j                              }t           j                            ||gddg|	                                          }nt           j                            |j                  rN|j                            d          r4t          j        d |                    d	          D                       }n5t           j                            |j                  r|j                            d          d
k    r|                    d          }n8t          j        dgt          |          z  t          j                              }|j                            d          d
k    r|                    d          }n8t          j        dgt          |          z  t          j                              }t           j                            ||gddg|	                                          }t          || j                  S )a  Cast an Arrow array to the Audio arrow storage type.
        The Arrow types that can be converted to the Audio pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the audio bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter

        Args:
            storage (`Union[pa.StringArray, pa.StructArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Audio arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`
        Ntyper%   r&   maskr6   c                 X    g | ]'}|!t                                          |          nd (S r+   )r   r]   ).0xs     r-   
<listcomp>z&Audio.cast_storage.<locals>.<listcomp>
  s3    vvv!am''***vvvr/   F)zero_copy_onlyr   )patypes	is_stringr   r6   lenry   StructArrayfrom_arraysis_nullis_large_binaryr   rz   	is_binary	is_structget_all_field_indicesto_numpyget_field_indexr   r'   )r,   r}   bytes_array
path_arrays       r-   cast_storagezAudio.cast_storage   s   $ 8gl++ 	w(D6CLL#8ry{{KKKKn00+w1G'SYIZahapaparar0ssGGX%%gl33 	w  G 4&3w<<"7bikkJJJJn00':1FRXHY`g`o`o`q`q0rrGGX-- 	w4&3w<<"7bikkJJJJn00':1FRXHY`g`o`o`q`q0rrGGX-- 	w',2T2TU\2]2] 	whvvwO_O_otO_OuOuvvv GG X-- 		w|++G4499%mmG44 hvG'<29;;OOO|++F33q88$]]622

Xtfs7||&;")++NNN
n00+z1JWV\L]dkdsdsdudu0vvG'4<000r/   local_filesremote_filesc                    i t           fd            t          j        fd|                                D             t          j                              }t          j        fd|                    d                                          D             t          j                              }t          j                            ||gddg|	                                          }t          || j                  S )	a7  Embed audio files into the Arrow array.

        Args:
            storage (`pa.StructArray`):
                PyArrow array to embed.
            token_per_repo_id (`dict`, optional):
                Dictionary repo_id -> token to fetch the files bytes.
            local_files (`bool`, defaults to `True`)
                Whether to embed local files data in the array

                <Added version="4.8.5"/>
            remote_files (`bool`, defaults to `True`)
                Whether to embed remote files data in the array.
                E.g. files with paths that start with hf:// or https://

                <Added version="4.8.5"/>

        Returns:
            `pa.StructArray`: Array in the Audio arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Nc                    |                      d          d         }|                    t          j                  rt          j        nt          j        }t          ||          }|                    |d                   nd }t          |          }t          | d|          5 }|
                                cd d d            S # 1 swxY w Y   d S )Nr`   ra   rb   rc   re   rf   )rj   rk   r   rl   rm   rn   r   rR   r   r   read)r&   rr   rs   rt   rd   rg   ru   r^   s          r-   path_to_bytesz*Audio.embed_storage.<locals>.path_to_bytes3  s   D))"-J+5+@+@AS+T+Tv''Z`Zv  !/z7 C CK\Kh%))*;I*FGGGnrE,5999OtT?CCC  qvvxx                                   s   %CC
C
c                     g | ]U}|O|d         ?rt          |d                   sr&t          |d                   r |d                   n	|d         nd VS )Nr%   r&   )r   r   )r   r   r   r   r   s     r-   r   z'Audio.embed_storage.<locals>.<listcomp>@  s     
 
 
  =	 z)% **7&	*B*B *HT *YfghiogpYqYq * "M!F),,, 7 
 
 
r/   r   c                     g | ]I}|Crt          |          sr.t          |          rt          j                            |          n|nd JS r+   )r   r   rS   r&   basename)r   r&   r   r   s     r-   r   z'Audio.embed_storage.<locals>.<listcomp>N  s     	 	 	  # $(5d(;(;AMR_`dReReBG$$T*** 	 	 	r/   r&   r%   r   )r   r   r6   	to_pylistry   r   rz   r   r   r   r   r'   )r,   r}   r^   r   r   r   r   r   s     ```  @r-   embed_storagezAudio.embed_storage  sI   0 $ "			  		  		  		  
 			  h
 
 
 
 
 
 !**,,
 
 
 
 
 
 X	 	 	 	 	 $MM&11;;==	 	 	 
 
 

 .,,k:-FRXHY`g`o`o`q`q,rr'4<000r/   r+   )NTT)$__name__
__module____qualname____doc__r   r
   int__annotations__r   boolr   r   r   r!   rH   r#   r	   r   structry   rz   r'   r   r)   r.   r   r%   rJ   r"   r]   rv   r|   StringArrayr   r   r    r/   r-   r   r      sM        8 8t $(M8C='''FD"&L(3-&&&"&L(3-&&&d777B777!E8C=!!!&RYibikk'R'RSSGXc]SSSwU???E3???  BE#ui~*U$V B[_ B B B BJ ]a9 99.6tCsDRVAW<W7X.Y9	9 9 9 9v	
}d33E.FFG 	
 	
 	
 	
,1E".".*H$I ,1bn ,1 ,1 ,1 ,1^ osC1 C1~C1LPC1gkC1	C1 C1 C1 C1 C1 C1r/   r   rq   r   r1   c                    t          | d          r| j        S 	 ddlm} n"# t          $ r}t	          d          |d }~ww xY w|                                 }t                      }|j        j        d         } ||j        	                                |j
                                      |d|           |                                d dS )	Nro   r   r3   r5   r7   r9   r:   r$   )hasattrro   rB   r4   rC   get_all_samplesr   datashapecpur8   rP   rQ   )rq   r4   rZ   samplesr[   r   s         r-   rK   rK   ^  s    um$$ :  	g8888888 	g 	g 	g\]]cff	g ''))|)!,W\%%''W5HIIIVV5| 	W 	
 	
 	
  **D999s     
?:?)&rS   dataclassesr   r   ior   pathlibr   typingr   r   r	   r
   r   numpyrN   pyarrowr    r   download.download_configr   tabler   utils.file_utilsr   r   r   utils.py_utilsr   r   rF   r   r{   r   r   r"   rK   r   r/   r-   <module>r      s   				 ( ( ( ( ( ( ( (             @ @ @ @ @ @ @ @ @ @ @ @ @ @               5 5 5 5 5 5       B B B B B B B B B B C C C C C C C C  &000000%%%%%% C1 C1 C1 C1 C1 C1 C1 C1L
:> :d : : : : : :r/   