
    |jR.                        d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ erd dlmZ ed         Zed         Zg Zd	Zd
ZdZdZdZdZdZ G d de          ZdS )    )annotationsN)defaultdict)TYPE_CHECKINGLiteraloverload)_check_exists_and_download)Datasettraintestval)endez2http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz 0c38be43600334966403524a40dcd81ei+  iK  z<s>z<e>z<unk>c                  f   e Zd ZU dZded<   ded<   ded<   ded	<   ded
<   ded<   ded<   ded<   ded<   ded<   	 	 	 	 	 	 d4d5dZe	 d6d7d!            Ze	 d6d8d#            Ze	 d6d9d%            Zd:d'Zd;d*Zd<d+Z	d=d.Z
d>d/Ze	 d6d?d0            Ze	 d6d@d1            Ze	 d6dAd2            Zd:d3ZdS )BWMT16a  
    Implementation of `WMT16 <http://www.statmt.org/wmt16/>`_ test dataset.
    ACL2016 Multimodal Machine Translation. Please see this website for more
    details: http://www.statmt.org/wmt16/multimodal-task.html#task1

    If you use the dataset created for your task, please cite the following paper:
    Multi30K: Multilingual English-German Image Descriptions.

    .. code-block:: text

        @article{elliott-EtAl:2016:VL16,
         author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
         title     = {Multi30K: Multilingual English-German Image Descriptions},
         booktitle = {Proceedings of the 6th Workshop on Vision and Language},
         year      = {2016},
         pages     = {70--74},
         year      = 2016
        }

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None.
        mode(str): 'train', 'test' or 'val'. Default 'train'.
        src_dict_size(int): word dictionary size for source language word. Default -1.
        trg_dict_size(int): word dictionary size for target language word. Default -1.
        lang(str): source language, 'en' or 'de'. Default 'en'.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True.

    Returns:
        Dataset: Instance of WMT16 dataset. The instance of dataset has 3 fields:
            - src_ids (np.array) - The sequence of token ids of source language.
            - trg_ids (np.array) - The sequence of token ids of target language.
            - trg_ids_next (np.array) - The next sequence of token ids of target language.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.text.datasets import WMT16

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, src_ids, trg_ids, trg_ids_next):
            ...         return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next)

            >>> wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50)

            >>> for i in range(10):
            ...     src_ids, trg_ids, trg_ids_next = wmt16[i]
            ...     src_ids = paddle.to_tensor(src_ids)
            ...     trg_ids = paddle.to_tensor(trg_ids)
            ...     trg_ids_next = paddle.to_tensor(trg_ids_next)
            ...
            ...     model = SimpleNet()
            ...     src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next)
            ...     print(src_ids.item(), trg_ids.item(), trg_ids_next.item())
            89 32 33
            79 18 19
            55 26 27
            147 36 37
            106 22 23
            135 50 51
            54 43 44
            217 30 31
            146 51 52
            55 24 25
    _Wmt16DataSetModemode
str | None	data_file_Wmt16Languagelangintsrc_dict_sizetrg_dict_sizedict[str, int]src_dicttrg_dictzlist[list[int]]src_idstrg_idstrg_ids_nextNr   r   TdownloadboolreturnNonec                   |                                 dv sJ d|             |                                 | _        || _        | j        .|s
J d            t          |t          t
          d|          | _        || _        |dk    s
J d            |dk    s
J d            t          ||dk    rt          nt                    | _
        t          ||dk    rt          nt                    | _        |                     ||          | _        |                     |dk    rdnd|          | _        |                                 | _        d S )	Nr
   z1mode should be 'train', 'test' or 'val', but got z>data_file is not set and downloading automatically is disabledwmt16r   z*dict_size should be set as positive numberr   r   )lowerr   r   r   DATA_URLDATA_MD5r   minTOTAL_EN_WORDSTOTAL_DE_WORDSr   r   
_load_dictr   r   
_load_datadata)selfr   r   r   r   r   r#   s          j/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/text/datasets/wmt16.py__init__zWMT16.__init__   ss    zz||  
 
 
 
 FtEE	
 
 

 JJLL	">!  P 8 88Xw DN 	q   "N   q   "N    ddllNN
 
 !ddllNN
 

 m<<DLLDDdM
 

 OO%%			    .	dict_sizereverseLiteral[True]dict[int, str]c                    d S N r2   r   r6   r7   s       r3   r/   zWMT16._load_dict   	     r5   Literal[False]c                    d S r;   r<   r=   s       r3   r/   zWMT16._load_dict   s	     r5   dict[int, str] | dict[str, int]c                    d S r;   r<   r=   s       r3   r/   zWMT16._load_dict   	     +.#r5   Fc                   t           j                            t          j        j        j        d| d| d          }d}t           j                            |          rMt          |d          5 }t          |
                                          |k    }d d d            n# 1 swxY w Y   |s|                     |||           i }t          |d          5 }t          |          D ]Z\  }	}
|r*|
                                                                ||	<   1|	||
                                                                <   [	 d d d            n# 1 swxY w Y   |S )Nwmt16/_.dictFrb)ospathjoinpaddledatasetcommon	DATA_HOMEexistsopenlen	readlines_build_dict	enumeratestripdecode)r2   r   r6   r7   	dict_path
dict_foundd	word_dictfdictidxlines              r3   r/   zWMT16._load_dict   s   GLLN!+,T,,I,,,
 
	 
7>>)$$ 	=i&& =! //9<
= = = = = = = = = = = = = = = 	9Y	4888	)T"" 	;e&u-- ; ;	T ;%)ZZ\\%8%8%:%:IcNN7:Idjjll113344	;	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; s%   -&BB#&B#A+EEErX   strc           	        t          t                    }t          j        | j        d          5 }|                    d          D ]}|                                }|                                                    d          }t          |          dk    rQ| j
        dk    r|d         n|d         }|                                D ]}	||	xx         dz  cc<   	 d d d            n# 1 swxY w Y   t          |d	          5 }
|
                    t           d
t           d
t           d
                                           t!          t#          |                                d d                    D ]R\  }}|dz   |k    r nC|
                    |d                                                    |
                    d           Sd d d            d S # 1 swxY w Y   d S )Nrr   zwmt16/train	   r   r      wb
c                    | d         S )Nre   r<   )xs    r3   <lambda>z#WMT16._build_dict.<locals>.<lambda>   s
    ! r5   T)keyr7         
)r   r   tarfilerQ   r   extractfilerW   rV   splitrR   r   write
START_MARKEND_MARKUNK_MARKencoderU   sorteditems)r2   rX   r6   r   r[   fr^   
line_splitsenwfoutr]   words                r3   rT   zWMT16._build_dict   sW     $$	\$.s333 	&qm44 & &{{}}!ZZ\\//55
z??a'''+yD'8'8jmmjm & &AaLLLA%LLLL&&	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& )T"" 	"dJJ:AAAAXAAAIIKKLLL&y((nndKKK  " "	T 7i''E

47>>++,,,

5!!!!	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	"s%   B*C''C+.C+CGGGc                z     j         t                   } j         t                   } j         t                    j        dk    rdnd}d|z
  }g  _        g  _        g  _        t          j	         j
        d          5 }|                    d j                   D ]}|                                }|                                                    d          }t!          |          dk    rR||                                         }|g fd	|D             z   |gz   }	||                                         }
 fd
|
D             }g ||}|g|} j                            |	            j                            |            j                            |           	 d d d            d S # 1 swxY w Y   d S )Nr   r   re   ra   rb   rE   rc   rd   c                F    g | ]}j                             |          S r<   )r   get.0r{   r2   unk_ids     r3   
<listcomp>z$WMT16._load_data.<locals>.<listcomp>  s+    GGGt}((F33GGGr5   c                F    g | ]}j                             |          S r<   )r   r   r   s     r3   r   z$WMT16._load_data.<locals>.<listcomp>	  s+    KKKA4=,,Q77KKKr5   )r   rr   rs   rt   r   r   r    r!   rn   rQ   r   ro   r   rW   rV   rp   rR   append)r2   start_idend_idsrc_coltrg_colrx   r^   ry   	src_wordsr   	trg_wordsr    r!   r   s   `            @r3   r0   zWMT16._load_data   s    =,x(x(yD((!!ag+\$.s333 	7q&:ty&:&:;; 7 7{{}}!ZZ\\//55
z??a''&w/5577	JGGGGGYGGGHh  'w/5577	KKKKKKKK11&1#.g.##G,,,##G,,,!((6666)7	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7s   =D%F00F47F4r]   Gtuple[npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_]]c                    t          j        | j        |                   t          j        | j        |                   t          j        | j        |                   fS r;   )nparrayr   r    r!   )r2   r]   s     r3   __getitem__zWMT16.__getitem__  sK     HT\#&''HT\#&''HT&s+,,
 	
r5   c                *    t          | j                  S r;   )rR   r   )r2   s    r3   __len__zWMT16.__len__  s    4<   r5   c                    d S r;   r<   r2   r   r7   s      r3   get_dictzWMT16.get_dict"  r>   r5   c                    d S r;   r<   r   s      r3   r   zWMT16.get_dict'  r>   r5   c                    d S r;   r<   r   s      r3   r   zWMT16.get_dict,  rC   r5   c                ,   || j         k    r| j        n| j        }t          j                            t          j        j        j	        d| d| d          }t          j        
                    |          s
J d            	 	 |                     ||          S )a  
        return the word dictionary for the specified language.

        Args:
            lang(string): A string indicating which language is the source
                          language. Available options are: "en" for English
                          and "de" for Germany.
            reverse(bool): If reverse is set to False, the returned python
                           dictionary will use word as key and use index as value.
                           If reverse is set to True, the returned python
                           dictionary will use index as key and word as value.

        Returns:
            dict: The word dictionary for the specific language.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import WMT16
                >>> wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50)
                >>> en_dict = wmt16.get_dict('en')

        rE   rF   rG   z Word dictionary does not exist. )r   r   r   rI   rJ   rK   rL   rM   rN   rO   rP   r/   )r2   r   r7   r6   rX   s        r3   r   zWMT16.get_dict1  s    4 #'$)"3"3D9K 	 GLLN!+,T,,I,,,
 
	 w~~i((LL*LLL(I"tY///r5   )Nr   r"   r"   r   T)r   r   r   r   r   r   r   r   r   r   r#   r$   r%   r&   ).)r   r   r6   r   r7   r8   r%   r9   )r   r   r6   r   r7   r?   r%   r   )r   r   r6   r   r7   r$   r%   rA   )F)rX   r_   r6   r   r   r   r%   r&   )r%   r&   )r]   r   r%   r   )r%   r   )r   r   r7   r8   r%   r9   )r   r   r7   r?   r%   r   )r   r   r7   r$   r%   rA   )__name__
__module____qualname____doc____annotations__r4   r   r/   rT   r0   r   r   r   r<   r5   r3   r   r   .   s6        F FP !!!! !%")#*& *& *& *& *&X MP    X 
 #&	    X DG. . . . X.   *" " " "0#7 #7 #7 #7J
 
 
 
! ! ! ! =@    X >A    X 47. . . . X.$0 $0 $0 $0 $0 $0r5   r   )
__future__r   rI   rn   collectionsr   typingr   r   r   numpyr   rL   paddle.dataset.commonr   	paddle.ior	   numpy.typingnptr   r   __all__r*   r+   r-   r.   rr   rs   rt   r   r<   r5   r3   <module>r      s*   # " " " " " 				  # # # # # # 3 3 3 3 3 3 3 3 3 3      < < < < < <       ) 67Z(N
?-
g0 g0 g0 g0 g0G g0 g0 g0 g0 g0r5   