
    |j0"                        d dl mZ d dlZd dlmZmZmZ d dlZd dl	m
Z
 d dlmZ erd dlmZ ed         Zg ZdZdZd	Zd
ZdZdZdZdZ G d de          ZdS )    )annotationsN)TYPE_CHECKINGLiteraloverload)_check_exists_and_download)DatasettraintestgenzJhttp://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz 7d7897317ddd8ba0ae5c5fa7248d3ff5z/http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz 0791583d57d5beb693b9414c5b36798cz<s>z<e>z<unk>   c                      e Zd ZU dZded<   ded<   ded<   ded	<   ded
<   ded<   ded<   ded<   	 	 	 	 d)d*dZd+dZd,dZd-dZe		 d.d/d!            Z
e		 d.d0d$            Z
e		 d.d1d&            Z
d2d(Z
dS )3WMT14a  
    Implementation of `WMT14 <http://www.statmt.org/wmt14/>`_ test dataset.
    The original WMT14 dataset is too large and a small set of data for set is
    provided. This module will download dataset from
    http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz .

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None.
        mode(str): 'train', 'test' or 'gen'. Default 'train'.
        dict_size(int): word dictionary size. Default -1.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True.

    Returns:
        Dataset: Instance of WMT14 dataset
            - src_ids (np.array) - The sequence of token ids of source language.
            - trg_ids (np.array) - The sequence of token ids of target language.
            - trg_ids_next (np.array) - The next sequence of token ids of target language.
    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.text.datasets import WMT14

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, src_ids, trg_ids, trg_ids_next):
            ...         return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)

            >>> wmt14 = WMT14(mode='train', dict_size=50)

            >>> for i in range(10):
            ...     src_ids, trg_ids, trg_ids_next = wmt14[i]
            ...     src_ids = paddle.to_tensor(src_ids)
            ...     trg_ids = paddle.to_tensor(trg_ids)
            ...     trg_ids_next = paddle.to_tensor(trg_ids_next)
            ...
            ...     model = SimpleNet()
            ...     src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
            ...     print(src_ids.item(), trg_ids.item(), trg_ids_next.item())
            91 38 39
            123 81 82
            556 229 230
            182 26 27
            447 242 243
            116 110 111
            403 288 289
            258 221 222
            136 34 35
            281 136 137

    _Wmt14DataSetModemode
str | None	data_fileint	dict_sizezlist[list[int]]src_idstrg_idstrg_ids_nextdict[str, int]src_dicttrg_dictNr
   TdownloadboolreturnNonec                L   |                                 dv sJ d|             |                                 | _        || _        | j        .|s
J d            t          |t          t
          d|          | _        |dk    s
J d            || _        |                                  d S )Nr	   z1mode should be 'train', 'test' or 'gen', but got z>data_file is not set and downloading automatically is disabledwmt14r   z*dict_size should be set as positive number)lowerr   r   r   	URL_TRAIN	MD5_TRAINr   
_load_data)selfr   r   r   r   s        j/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/text/datasets/wmt14.py__init__zWMT14.__init__p   s     zz||  
 
 
 
 FtEE	
 
 

 JJLL	">!  P 8 89i( DN
 1}}}J}}}"    c                
    dd}g  _         g  _        g  _        t          j         j        d          5 }d |D             }t          |          d	k    sJ  ||                    |d
                    j                   _	        d |D             }t          |          d	k    sJ  ||                    |d
                    j                   _
         j         d j         fd|D             }|D ]o}|                    |          D ]U}|                                }|                                                    d          }t          |          dk    rR|d
         }|                                } fdt          g|t           D             }	|d	         }
|
                                } fd|D             }t          |	          dk    st          |          dk    rg | j
        t                    } j
        t                   g|} j                             |	            j                            |            j                            |           Wq	 d d d            d S # 1 swxY w Y   d S )Nsizer   r!   r   c                    i }t          |           D ]5\  }}||k     r*|||                                                                <   5 |S N)	enumeratestripdecode)fdr.   out_dict
line_countlines        r*   	__to_dictz#WMT14._load_data.<locals>.__to_dict   sU    H$-bMM   
D$$6@HTZZ\\002233Or,   r)r   c                P    g | ]#}|j                             d           |j         $S )zsrc.dictnameendswith.0	each_items     r*   
<listcomp>z$WMT14._load_data.<locals>.<listcomp>   @       >**:66  r,      r   c                P    g | ]#}|j                             d           |j         $S )ztrg.dictr;   r>   s     r*   rA   z$WMT14._load_data.<locals>.<listcomp>   rB   r,   /c                R    g | ]#}|j                                       |j         $S  r;   )r?   r@   	file_names     r*   rA   z$WMT14._load_data.<locals>.<listcomp>   sA       >**955  r,   	r   c                P    g | ]"}j                             |t                    #S rG   )r   getUNK_IDXr?   wr)   s     r*   rA   z$WMT14._load_data.<locals>.<listcomp>   s;        ))!W55  r,   c                P    g | ]"}j                             |t                    #S rG   )r   rK   rL   rM   s     r*   rA   z$WMT14._load_data.<locals>.<listcomp>   s+    PPPt}00G<<PPPr,   P   )r.   r   r!   r   )r   r   r   tarfileopenr   lenextractfiler   r   r   r   r3   r2   splitSTARTENDappend)r)   _WMT14__to_dictfnamesr<   r7   
line_splitsrc_seq	src_wordsr   trg_seq	trg_wordsr   r   rH   s   `             @r*   r(   zWMT14._load_data   s   	 	 	 	 \$.s333 /	;q !"  E
 u::????%IammE!H&=&=t~NNDM !"  E
 u::????%IammE!H&=&=t~NNDM922ty22I   !"  E
  ; ;MM$// ; ;D;;==D!%!3!3D!9!9J:!++ (mG 'I   "'!9)!9S!9  G
 )mG 'IPPPPiPPPG 7||b((CLL2,=,= #AW#AdmC.@#AL#}U3>g>GL''000L''000%,,\::::1;;-/	; /	; /	; /	; /	; /	; /	; /	; /	; /	; /	; /	; /	; /	; /	; /	; /	; /	;s   H3I88I<?I<idxGtuple[npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_]]c                    t          j        | j        |                   t          j        | j        |                   t          j        | j        |                   fS r0   )nparrayr   r   r   )r)   ra   s     r*   __getitem__zWMT14.__getitem__   sK     HT\#&''HT\#&''HT&s+,,
 	
r,   c                *    t          | j                  S r0   )rS   r   )r)   s    r*   __len__zWMT14.__len__   s    4<   r,   .reverseLiteral[True]%tuple[dict[int, str], dict[int, str]]c                    d S r0   rG   r)   ri   s     r*   get_dictzWMT14.get_dict   	     14r,   Literal[False]%tuple[dict[str, int], dict[str, int]]c                    d S r0   rG   rm   s     r*   rn   zWMT14.get_dict   ro   r,   Mtuple[dict[str, int], dict[str, int]] | tuple[dict[int, str], dict[int, str]]c                    d S r0   rG   rm   s     r*   rn   zWMT14.get_dict   s	     sr,   Fc                    | j         | j        }}|r<d |                                D             }d |                                D             }||fS )a  
        Get the source and target dictionary.

        Args:
            reverse (bool): whether to reverse key and value in dictionary,
                i.e. key: value to value: key.

        Returns:
            Two dictionaries, the source and target dictionary.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import WMT14
                >>> wmt14 = WMT14(mode='train', dict_size=50)
                >>> src_dict, trg_dict = wmt14.get_dict()

        c                    i | ]\  }}||	S rG   rG   r?   kvs      r*   
<dictcomp>z"WMT14.get_dict.<locals>.<dictcomp>      :::A1:::r,   c                    i | ]\  }}||	S rG   rG   rw   s      r*   rz   z"WMT14.get_dict.<locals>.<dictcomp>  r{   r,   )r   r   items)r)   ri   r   r   s       r*   rn   zWMT14.get_dict   s^    ( "]DM( 	;::)9)9:::H::)9)9:::H!!r,   )Nr
   r   T)
r   r   r   r   r   r   r   r    r!   r"   )r!   r"   )ra   r   r!   rb   )r!   r   ).)ri   rj   r!   rk   )ri   rp   r!   rq   )ri   r    r!   rs   )F)__name__
__module____qualname____doc____annotations__r+   r(   rf   rh   r   rn   rG   r,   r*   r   r   -   sp        7 7r NNN!!!! !%")    8<; <; <; <;|
 
 
 
! ! ! ! '*4 4 4 4 X4 (+4 4 4 4 X4 !    X" " " " " "r,   r   )
__future__r   rQ   typingr   r   r   numpyrd   paddle.dataset.commonr   	paddle.ior   numpy.typingnptr   __all__URL_DEV_TESTMD5_DEV_TESTr&   r'   rV   rW   UNKrL   r   rG   r,   r*   <module>r      s   # " " " " "  3 3 3 3 3 3 3 3 3 3     < < < < < <       8 67
 Q  2 >	.	
W" W" W" W" W"G W" W" W" W" W"r,   