
    |j@                        d dl mZ d dlZd dlZd dlmZmZ d dlZd dl	m
Z
 d dlmZ erd dlmZ ed         Zed         Zg ZdZd	Z G d
 de          ZdS )    )annotationsN)TYPE_CHECKINGLiteral)_check_exists_and_download)DatasetNGRAMSEQtraintestz<https://dataset.bj.bcebos.com/imikolov%2Fsimple-examples.tgz 30177ea32e27c525793142b6bf2c8e2dc                      e Zd ZU dZded<   ded<   ded<   ded	<   ded
<   ded<   	 	 	 	 	 	 d d!dZd"dZd#dZd$dZd%dZ	d&dZ
dS )'Imikolova  
    Implementation of imikolov dataset.

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None.
        data_type(str): 'NGRAM' or 'SEQ'. Default 'NGRAM'.
        window_size(int): sliding window size for 'NGRAM' data. Default -1.
        mode(str): 'train' 'test' mode. Default 'train'.
        min_word_freq(int): minimal word frequencies for building word dictionary. Default 50.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True

    Returns:
        Dataset: instance of imikolov dataset

    Examples:

        .. code-block:: python

            >>> # doctest: +TIMEOUT(60)
            >>> import paddle
            >>> from paddle.text.datasets import Imikolov

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, src, trg):
            ...         return paddle.sum(src), paddle.sum(trg)


            >>> imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2)

            >>> for i in range(10):
            ...     src, trg = imikolov[i]
            ...     src = paddle.to_tensor(src)
            ...     trg = paddle.to_tensor(trg)
            ...
            ...     model = SimpleNet()
            ...     src, trg = model(src, trg)
            ...     print(src.item(), trg.item())
            2076 2075
            2076 2075
            675 674
            4 3
            464 463
            2076 2075
            865 864
            2076 2075
            2076 2075
            1793 1792

    
str | None	data_file_ImikolovDataType	data_typeintwindow_size_ImikolovDataSetModemodemin_word_freqdict[str, int]word_idxNr	   r   2   TdownloadboolreturnNonec                   |                                 dv sJ d|             |                                 | _        |                                dv sJ d|             |                                | _        || _        || _        || _        | j        .|s
J d            t          |t          t          d|          | _        | 
                    |          | _        |                                  d S )Nr   z,data type should be 'NGRAM', 'SEQ', but got r   z(mode should be 'train', 'test', but got z;data_file is not set and downloading automatically disabledimikolov)upperr   lowerr   r   r   r   r   URLMD5_build_work_dictr   
_load_anno)selfr   r   r   r   r   r   s          m/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/text/datasets/imikolov.py__init__zImikolov.__init__c   s)       %
 
 
 
 F)EE
 
 
 #**zz||  
 
 
 
 =d<<
 
 
 JJLL	&*">!  M 8 83Z DN
 --m<< 	    c                    |t          j        t                    }|D ][}|                                                                D ]}||xx         dz  cc<   |dxx         dz  cc<   |dxx         dz  cc<   \|S )N   <s><e>)collectionsdefaultdictr   stripsplit)r*   f	word_freqlws        r+   
word_countzImikolov.word_count   s    #/44I 	" 	"AWWYY__&& " "!!e!e!r-   cutoffc                h    d}d}t          j         j                  5 }|                    |          }|                    |          }                     |                     |                    }d|v r|d=  fd|                                D             }t          |d           }t          t          |           \  }	}
t          t          t          |	t          t          |	                                                  }t          |	          |d<   d d d            n# 1 swxY w Y   |S )Nz$./simple-examples/data/ptb.train.txtz$./simple-examples/data/ptb.valid.txt<unk>c                6    g | ]}|d          j         k    |S )r/   )r   ).0xr*   s     r+   
<listcomp>z-Imikolov._build_work_dict.<locals>.<listcomp>   s1       !t7I0I0I0I0I0Ir-   c                $    | d          | d         fS )Nr/   r    )r@   s    r+   <lambda>z+Imikolov._build_work_dict.<locals>.<lambda>   s    1qt} r-   )key)tarfileopenr   extractfiler:   itemssortedlistzipdictrangelen)r*   r;   train_filenametest_filenametftrainftestfr7   word_freq_sortedwords_r   s   `           r+   r(   zImikolov._build_work_dict   sk   ?>\$.)) 	+R^^N33FNN=11Etv/F/FGGI)##g&   $??,,  I  &i5L5LMMMC!1233HE1DUE#e**,=,=!>!>??@@H #E

HW	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+" s   C<D''D+.D+c           
         g  _         t          j         j                  5 }d j         d}|                    |          } j        d         |D ]} j        dk    rǉ j        dk    s
J d            dg|	                                
                                d}t          |           j        k    rn fd	|D             }t           j        t          |          d
z             D ]9} j                             t          || j        z
  |                              :Չ j        dk    r|	                                
                                } fd|D             } j        d         g|}g | j        d         } j        dk    rt          |           j        k    r[ j                             ||f           yt          d          	 d d d            d S # 1 swxY w Y   d S )Nz./simple-examples/data/ptb.z.txtr=   r	   r   zInvalid gram lengthr0   r1   c                F    g | ]}j                             |          S rC   r   getr?   r9   UNKr*   s     r+   rA   z'Imikolov._load_anno.<locals>.<listcomp>   s+    BBB1T]..q#66BBBr-   r/   r
   c                F    g | ]}j                             |          S rC   rZ   r\   s     r+   rA   z'Imikolov._load_anno.<locals>.<listcomp>   s+    >>>q**1c22>>>r-   r   zUnknown data type)datarF   rG   r   r   rH   r   r   r   r4   r5   rO   rN   appendtupleAssertionError)	r*   rR   filenamer6   r8   isrc_seqtrg_seqr]   s	   `       @r+   r)   zImikolov._load_anno   sV   	\$.)) 	>RDTYDDDHx((A-(C > >>W,,+b0002G000:!2!2:E:A1vv!111BBBBBBBB!&t'7Q!!D!D Q QA I,,U1Q9I5IA5M3N-O-OPPPP^u,,		))A>>>>>A>>>A#}U38a8G884=#78G'!++Gt?O0O0O I$$gw%78888()<===#>	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   F8G))G-0G-idx1tuple[npt.NDArray[np.int_], npt.NDArray[np.int_]]c                J    t          d | j        |         D                       S )Nc                6    g | ]}t          j        |          S rC   )nparray)r?   ds     r+   rA   z(Imikolov.__getitem__.<locals>.<listcomp>   s     :::abhqkk:::r-   )ra   r_   )r*   rg   s     r+   __getitem__zImikolov.__getitem__   s'     ::49S>:::;;;r-   c                *    t          | j                  S N)rO   r_   )r*   s    r+   __len__zImikolov.__len__   s    49~~r-   )Nr	   r   r   r   T)r   r   r   r   r   r   r   r   r   r   r   r   r    r!   rp   )r;   r   r    r   )r    r!   )rg   r   r    rh   )r    r   )__name__
__module____qualname____doc____annotations__r,   r:   r(   r)   rn   rq   rC   r-   r+   r   r   $   s         5 5n      !%'.%,% % % % %N
 
 
 
   ,> > > >4< < < <
     r-   r   )
__future__r   r2   rF   typingr   r   numpyrk   paddle.dataset.commonr   	paddle.ior   numpy.typingnptr   r   __all__r&   r'   r   rC   r-   r+   <module>r      s    # " " " " "      ) ) ) ) ) ) ) )     < < < < < <       4/"?3
D(h h h h hw h h h h hr-   