
    |j5                        d dl mZ d dlmZ er
d dlZd dlmZ d dlZd dl	Z	d dlZd dl
mZ d dlmZ g ZdZdZdZd	Zd
ZdZdZdZdZdZd Z G d de          ZdS )    )annotations)TYPE_CHECKINGN)_check_exists_and_download)DatasetzBhttp://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz 387719152ae52d60422c016e92a742fcz:http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt ea7fb7d4c75cc6254716f0177a506baaz:http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt 0d2977293bbb6cbefab5b0f97db1e77cz<http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt d8c7f03ceb5fc2e5a0fa7503a4353751z1http://paddlemodels.bj.bcebos.com/conll05st%2Femb bf436eb0faa1f6f9103017f8be57cdb7c                      e Zd ZU dZded<   ded<   ded<   ded<   ded<   ded	<   ded
<   ded<   ded<   ded<   ded<   	 	 	 	 	 	 d$d%dZd&dZd&dZd'dZd(dZ	d)d Z
d*d"Zd+d#ZdS ),	Conll05sta	  
    Implementation of `Conll05st <https://www.cs.upc.edu/~srlconll/soft.html>`_
    test dataset.

    Note: only support download test dataset automatically for that
          only test dataset of Conll05st is public.

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None
        word_dict_file(str|None): path to word dictionary file, can be set None if
            :attr:`download` is True. Default None
        verb_dict_file(str|None): path to verb dictionary file, can be set None if
            :attr:`download` is True. Default None
        target_dict_file(str|None): path to target dictionary file, can be set None if
            :attr:`download` is True. Default None
        emb_file(str|None): path to embedding dictionary file, only used for
            :code:`get_embedding` can be set None if :attr:`download` is
            True. Default None
        download(bool): whether to download dataset automatically if
            :attr:`data_file` :attr:`word_dict_file` :attr:`verb_dict_file`
            :attr:`target_dict_file` is not set. Default True

    Returns:
        Dataset: instance of conll05st dataset

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.text.datasets import Conll05st

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, pred_idx, mark, label):
            ...         return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label)


            >>> conll05st = Conll05st()

            >>> for i in range(10):
            ...     pred_idx, mark, label= conll05st[i][-3:]
            ...     pred_idx = paddle.to_tensor(pred_idx)
            ...     mark = paddle.to_tensor(mark)
            ...     label = paddle.to_tensor(label)
            ...
            ...     model = SimpleNet()
            ...     pred_idx, mark, label= model(pred_idx, mark, label)
            ...     print(pred_idx.item(), mark.item(), label.item())
            >>> # doctest: +SKIP('label will change')
            65840 5 1991
            92560 5 3686
            99120 5 457
            121960 5 3945
            4774 5 2378
            14973 5 1938
            36921 5 1090
            26908 5 2329
            62965 5 2968
            97755 5 2674

    
str | None	data_fileword_dict_fileverb_dict_filetarget_dict_fileemb_filedict[str, int]	word_dictpredicate_dict
label_dictlist	sentences
predicateslabelsNTdownloadboolc                @   || _         | j         .|s
J d            t          |t          t          d|          | _         || _        | j        .|s
J d            t          |t
          t          d|          | _        || _        | j        .|s
J d            t          |t          t          d|          | _        || _
        | j
        .|s
J d            t          |t          t          d|          | _
        || _        | j        .|s
J d            t          |t          t          d|          | _        |                     | j                  | _        |                     | j                  | _        |                     | j
                  | _        |                                  d S )Nz>data_file is not set and downloading automatically is disabled	conll05stzCword_dict_file is not set and downloading automatically is disabledzCverb_dict_file is not set and downloading automatically is disabledzEtarget_dict_file is not set and downloading automatically is disabledz=emb_file is not set and downloading automatically is disabled)r   r   DATA_URLDATA_MD5r   WORDDICT_URLWORDDICT_MD5r   VERBDICT_URLVERBDICT_MD5r   TRGDICT_URLTRGDICT_MD5r   EMB_URLEMB_MD5
_load_dictr   r   _load_label_dictr   
_load_anno)selfr   r   r   r   r   r   s          l/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/text/datasets/conll05.py__init__zConll05st.__init__}   s    #>!  P 8 88X{H DN -&  U 8 #=# #D -&  U 8 #=# #D !1 (  W 8 %? % %D! !=   O 8 7'7K DM )<=="ood.ABB//0EFF 	    filenamestrreturnc                   i }t                      }t          |d          5 }t          |          D ]~\  }}|                                }|                    d          r|                    |dd                     L|                    d          r|                    |dd                     d}|D ]}||d|z   <   |dz  }||d|z   <   |dz  }||d<   d d d            n# 1 swxY w Y   |S )NrB-   I-r      O)setopen	enumeratestrip
startswithadd)	r-   r1   dtag_dictfilineindextags	            r.   r+   zConll05st._load_label_dict   sJ   55(C   	A$Q<< + +4zz||??4(( +LLabb****__T** +LLabb***E   %$*
 %$*
AcF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 s   B5C""C&)C&c                    i }t          |d          5 }t          |          D ]\  }}|||                                <   	 d d d            n# 1 swxY w Y   |S )Nr5   )r<   r=   r>   )r-   r1   rA   rC   rD   rE   s         r.   r*   zConll05st._load_dict   s    (C   	$A$Q<< $ $4"#$**,,$	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ s   -AAANonec           
         t          j        | j                  }|                    d          }|                    d          }g | _        g | _        g | _        t          j        |          5 }t          j        |          5 }g }g }g }t          ||          D ]\  }	}
|	
                                                                }	|

                                                                                                }
t          |
          dk    rSt          t          |d                             D ]%fd|D             }|                    |           &t          |          dk    rg }|d         D ]}|dk    r|                    |           t!          |dd                    D ]\  }d}d	}g }d
}|D ]H}|dk    r|s|                    d           !|dk    r|r|                    d|z              B|dk    r|                    d|z              d	}c|                    d          dk    rQ|                    d          dk    r8|d|                    d                   }|                    d|z              d	}|                    d          dk    rR|                    d          dk    r9|d|                    d                   }|                    d|z              d}8t%          d|           | j                            |           | j                            |                    | j                            |           g }g }g }|                    |	           |                    |
           	 d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                                 |                                 |                                 d S )Nz2conll05st-release/test.wsj/words/test.wsj.words.gzz2conll05st-release/test.wsj/props/test.wsj.props.gz)fileobjr   c                     g | ]
}|         S  rM   ).0xrD   s     r.   
<listcomp>z(Conll05st._load_anno.<locals>.<listcomp>   s    '>'>'>!'>'>'>r0   r9   -r:   F *r8   z*)()r6   TzUnexpected label: )tarfiler<   r   extractfiler   r   r   gzipGzipFilezipr>   decodesplitlenrangeappendr=   findRuntimeErrorclose)r-   tfwfpf
words_file
props_filer   r   one_segwordlabela_kind_label	verb_listrO   lblcur_tagis_in_bracketlbl_seq	verb_wordlrD   s                       @r.   r,   zConll05st._load_anno   st   \$.))^^@
 
 ^^@
 
 M"%%%7	*)3M"%%%7	*)3IFG":z:: 0* 0*ezz||**,,,,..4466u::??"3wqz??33 4 4'>'>'>'>g'>'>'>l33336{{a''$&	!' 4 4A Cxx ) 0 0 3 3 3&/qrr
&;&; 8 8FAs&)G,1M&(G(*I%( Q Q#$88M8$+NN3$7$7$7$7%&#XX-X$+NN4'>$B$B$B$B%&$YY$+NN4'>$B$B$B49MM%&VVC[[B%6%6166#;;";L;L./AFF3KK.@G$+NN4'>$B$B$B49MM%&VVC[[B%6%6166#;;";L;L./AFF3KK.@G$+NN4'>$B$B$B48MM*67OA7O7O*P*P$P N11)<<< O229Q<@@@ K..w7777 "IF GG$$T***NN5))))a0*7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	* 7	*r 	












s7   /N7LN N7 N$	$N7'N$	(N77N;>N;idxinttuple[npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_], npt.NDArray[np.int_]]c                0     j         |         } j        |         } j        |         }t          |          }|                    d          }dgt          |          z  }|dk    rd||dz
  <   ||dz
           }nd}|dk    rd||dz
  <   ||dz
           }	nd}	d||<   ||         }
|t          |          dz
  k     rd||dz   <   ||dz            }nd}|t          |          dz
  k     rd||dz   <   ||dz            }nd} fd|D             } j                            |	t                    g|z  } j                            |t                    g|z  } j                            |
t                    g|z  } j                            |t                    g|z  } j                            |t                    g|z  } j                            |          g|z  } fd|D             }t          j
        |          t          j
        |          t          j
        |          t          j
        |          t          j
        |          t          j
        |          t          j
        |          t          j
        |          t          j
        |          f	S )	NzB-Vr   r9   bosr7   eosc                P    g | ]"}j                             |t                    #S rM   )r   getUNK_IDXrN   wr-   s     r.   rP   z)Conll05st.__getitem__.<locals>.<listcomp>X  s+    EEEqDN&&q'22EEEr0   c                D    g | ]}j                             |          S rM   )r   r{   r}   s     r.   rP   z)Conll05st.__getitem__.<locals>.<listcomp>a  s)    <<<T_((++<<<r0   )r   r   r   r^   rF   r   r{   r|   r   nparray)r-   rt   sentence	predicater   sen_len
verb_indexmarkctx_n1ctx_n2ctx_0ctx_p1ctx_p2word_idx
ctx_n2_idx
ctx_n1_idx	ctx_0_idx
ctx_p1_idx
ctx_p2_idxpred_idx	label_idxs   `                    r.   __getitem__zConll05st.__getitem__(  s    >#&OC(	S!h--\\%((
sS[[ >>#$Da j1n-FFF>>#$Da j1n-FFFZ$Fa''#$Da j1n-FFFFa''#$Da j1n-FFFEEEEHEEEn((99:WD
n((99:WD
^''w7787B	n((99:WD
n((99:WD
'++I667'A<<<<V<<<	 HXHZ  HZ  HYHZ  HZ  HXHTNNHY

 
	
r0   c                *    t          | j                  S )N)r^   r   r-   s    r.   __len__zConll05st.__len__o  s    4>"""r0   5tuple[dict[str, int], dict[str, int], dict[str, int]]c                *    | j         | j        | j        fS )aD  
        Get the word, verb and label dictionary of Wikipedia corpus.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import Conll05st

                >>> conll05st = Conll05st()
                >>> word_dict, predicate_dict, label_dict = conll05st.get_dict()

        )r   r   r   r   s    r.   get_dictzConll05st.get_dictr  s     ~t2DOCCr0   c                    | j         S )a  
        Get the embedding dictionary file.

        Examples:

            .. code-block:: python

                >>> from paddle.text.datasets import Conll05st

                >>> conll05st = Conll05st()
                >>> emb_file = conll05st.get_embedding()

        )r   r   s    r.   get_embeddingzConll05st.get_embedding  s     }r0   )NNNNNT)r   r   r   r   r   r   r   r   r   r   r   r   )r1   r2   r3   r   )r3   rI   )rt   ru   r3   rv   )r3   ru   )r3   r   )r3   r2   )__name__
__module____qualname____doc____annotations__r/   r+   r*   r,   r   r   r   r   rM   r0   r.   r   r   .   sp        @ @D     """"OOOLLL !%%)%)'+#G G G G GR   &   F F F FPE
 E
 E
 E
N# # # #D D D D      r0   r   )
__future__r   typingr   numpyr   numpy.typingnptrY   rW   paddle.dataset.commonr   	paddle.ior   __all__r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r|   r   rM   r0   r.   <module>r      s   # " " " " "                    < < < < < <      
O-K1K1L0
=
,
b b b b b b b b b br0   