
    |j#                        d dl mZ d dlZd dlZd dlmZmZmZ d dlZ	d dl
mZ d dlmZ erd dlmZ ed         Zg Zg dZdZd	Z G d
 d          Z G d d          Z G d de          ZdS )    )annotationsN)TYPE_CHECKINGAnyLiteral)_check_exists_and_download)Datasettraintest)         #   -   2   8   z3https://dataset.bj.bcebos.com/movielens%2Fml-1m.zip c4d9eecfca2ab87c1945afe126590906c                  P    e Zd ZU dZded<   ded<   ded<   dd
Zd ZddZddZdS )	MovieInfozM
    Movie id, title and categories information are stored in MovieInfo.
    intindex	list[str]
categoriesstrtitlereturnNonec                J    t          |          | _        || _        || _        d S N)r   r   r   r   )selfr   r   r   s       n/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/text/datasets/movielens.py__init__zMovieInfo.__init__.   s!    ZZ
$


    c                    | j         gfd| j        D             fd| j                                        D             gS )z/
        Get information from a movie.
        c                     g | ]
}|         S  r&   ).0ccategories_dicts     r!   
<listcomp>z#MovieInfo.value.<locals>.<listcomp>9   s    999A_Q999r#   c                D    g | ]}|                                          S r&   )lower)r'   wmovie_title_dicts     r!   r*   z#MovieInfo.value.<locals>.<listcomp>:   s'    EEEQaggii(EEEr#   )r   r   r   split)r    r)   r.   s    ``r!   valuezMovieInfo.value3   sU    
 ZL9999999EEEE$*2B2B2D2DEEE
 	
r#   c                8    d| j          d| j         d| j         dS )Nz<MovieInfo id(z	), title(z), categories()>)r   r   r   r    s    r!   __str__zMovieInfo.__str__=   s)    b
bbTZbbtbbbbr#   c                *    |                                  S r   )r4   r3   s    r!   __repr__zMovieInfo.__repr__@   s    ||~~r#   N)r   r   r   r   r   r   r   r   r   r   	__name__
__module____qualname____doc____annotations__r"   r0   r4   r6   r&   r#   r!   r   r   %   s           JJJJJJ   

 
 
c c c c     r#   r   c                  Z    e Zd ZU dZded<   ded<   ded<   ded<   ddZd ZddZddZdS )UserInfozK
    User id, gender, age, and job information are stored in UserInfo.
    r   r   boolis_maleagejob_idr   genderr   r   c                    t          |          | _        |dk    | _        t                              t          |                    | _        t          |          | _        d S )NM)r   r   rA   	age_tablerB   rC   )r    r   rD   rB   rC   s        r!   r"   zUserInfo.__init__N   sD    ZZ
}??3s88,,&kkr#   c                F    | j         g| j        rdndg| j        g| j        ggS )z.
        Get information from a user.
        r   r   )r   rA   rB   rC   r3   s    r!   r0   zUserInfo.valueT   s4    
 ZL,%QQA&XJ[M	
 	
r#   c           	     j    | j         rdnd}d| j         d| dt          | j                  d| j         d	S )NrF   Fz<UserInfo id(z
), gender(z), age(z), job(r2   )rA   r   rG   rB   rC   )r    rD   s     r!   r4   zUserInfo.__str___   sH    -#otzooVooIdhDWoo`d`koooor#   c                     t          |           S r   )r   r3   s    r!   r6   zUserInfo.__repr__c   s    4yyr#   N)
r   r   rD   r   rB   r   rC   r   r   r   r7   r8   r&   r#   r!   r?   r?   D   s           JJJMMMHHHKKK" " " "	
 	
 	
p p p p     r#   r?   c                      e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   ded<   ded<   ded<   	 	 	 	 	 d#d$dZd%dZd%dZd&d!Zd'd"Z	dS )(	Movielensa  
    Implementation of `Movielens 1-M <https://grouplens.org/datasets/movielens/1m/>`_ dataset.

    Args:
        data_file(str|None): path to data tar file, can be set None if
            :attr:`download` is True. Default None.
        mode(str): 'train' or 'test' mode. Default 'train'.
        test_ratio(float): split ratio for test sample. Default 0.1.
        rand_seed(int): random seed. Default 0.
        download(bool): whether to download dataset automatically if
            :attr:`data_file` is not set. Default True.

    Returns:
        Dataset: instance of Movielens 1-M dataset.

    Examples:

        .. code-block:: pycon

            >>> # doctest: +TIMEOUT(75)
            >>> import paddle
            >>> from paddle.text.datasets import Movielens

            >>> class SimpleNet(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...
            ...     def forward(self, category, title, rating):
            ...         return paddle.sum(category), paddle.sum(title), paddle.sum(rating)


            >>> movielens = Movielens(mode='train')

            >>> for i in range(10):
            ...     category, title, rating = movielens[i][-3:]
            ...     category = paddle.to_tensor(category)
            ...     title = paddle.to_tensor(title)
            ...     rating = paddle.to_tensor(rating)
            ...
            ...     model = SimpleNet()
            ...     category, title, rating = model(category, title, rating)
            ...     print(category.shape, title.shape, rating.shape)
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
            paddle.Size([]) paddle.Size([]) paddle.Size([])
    _MovieLensDataSetModemode
str | None	data_filefloat
test_ratior   	rand_seedzdict[int, MovieInfo]
movie_infozdict[str, int]r.   r)   zdict[int, UserInfo]	user_infozlist[list[float]]dataNr
   皙?r   Tdownloadr@   r   r   c                   |                                 dv sJ d|             |                                 | _        || _        | j        .|s
J d            t          |t          t
          d|          | _        || _        || _        t          j	        
                    |           |                                  |                                  d S )Nr	   z(mode should be 'train', 'test', but got z>data_file is not set and downloading automatically is disabled	sentiment)r,   rO   rQ   r   URLMD5rS   rT   nprandomseed_load_meta_info
_load_data)r    rQ   rO   rS   rT   rY   s         r!   r"   zMovielens.__init__   s     zz||  
 
 
 
 =d<<
 
 
 JJLL	">!  P 8 83[( DN %"
	y!!!r#   c           
        t          j        d          }i | _        i | _        i | _        i | _        t          j        | j                  5 }|	                                D ]Y}t          |t          j                  sJ t                      }t                      }|                    d          5 }t          |          D ]\  }}|                    d          }|                                                    d          \  }	}
}|                    d          }|D ]}|                    |           |                    |
                              d          }
t+          |	||
          | j        t-          |	          <   |
                                D ])}|                    |                                           *	 d d d            n# 1 swxY w Y   t          |          D ]\  }}|| j        |<   t          |          D ]\  }}|| j        |<   |                    d	          5 }|D ]m}|                    d          }|                                                    d          \  }}}}}t1          ||||
          | j        t-          |          <   n	 d d d            n# 1 swxY w Y   [	 d d d            d S # 1 swxY w Y   d S )Nz^(.*)\((\d+)\)$zml-1m/movies.datlatinencoding::|r   )r   r   r   zml-1m/users.dat)r   rD   rB   rC   )recompilerU   r.   r)   rV   zipfileZipFilerQ   infolist
isinstanceZipInfosetopen	enumeratedecodestripr/   addmatchgroupr   r   r,   r?   )r    patternpackageinfotitle_word_setcategories_set
movie_fileilinemovie_idr   r   r(   r-   	user_fileuidrD   rB   job_s                       r!   ra   zMovielens._load_meta_info   s   */00 "!_T^,, 	((**  !$88888!$!$\\"455 :#,Z#8#8 : :4#{{G{<<6:jjll6H6H6N6N3%%/%5%5c%:%:
!+ 2 2A*..q1111 'e 4 4 : :1 = =9B"*z: : :H6 "' : :A*..qwwyy9999::: : : : : : : : : : : : : : : &n55 1 1DAq/0D)!,,%n55 0 0DAq./D(++\\"344 	 )  #{{G{<<37::<<3E3Ed3K3K0VS#q3;"%f#c4 4 4s3xx00              1	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s^   
A%K/DGKGKGAK/A1J-!K-J11K4J15KKKc           
        g | _         | j        dk    }t          j        | j                  5 }|                    d          5 }|D ]}|                    d          }t          j                                        | j	        k     |k    r|
                                                    d          \  }}}}t          |          }t          |          }t          |          dz  dz
  }| j        |         }| j        |         }	| j                             |	                                |                    | j        | j                  z   |ggz              	 d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nr   zml-1m/ratings.datrd   re   rg      g      @)rW   rO   rk   rl   rQ   rq   rs   r^   r_   rS   rt   r/   r   rR   rU   rV   appendr0   r)   r.   )
r    is_testry   ratingr   r   mov_idr   movusrs
             r!   rb   zMovielens._load_data   s   	)v%ODN++	/6LL,--	17  {{G{44I$$&&8WDD-1ZZ\\-?-?-E-E*Cc((C [[F"6]]Q.4F/&1C.-CI$$		))D$8$:OPPQ"8*%  		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s6   E;DE#E;#E'	'E;*E'	+E;;E?E?idxtuple[npt.NDArray[Any], ...]c                N    | j         |         }t          d |D                       S )Nc                6    g | ]}t          j        |          S r&   )r^   array)r'   ds     r!   r*   z)Movielens.__getitem__.<locals>.<listcomp>  s     000abhqkk000r#   )rW   tuple)r    r   rW   s      r!   __getitem__zMovielens.__getitem__  s*    y~004000111r#   c                *    t          | j                  S r   )lenrW   r3   s    r!   __len__zMovielens.__len__  s    49~~r#   )Nr
   rX   r   T)rQ   rP   rO   rN   rS   rR   rT   r   rY   r@   r   r   )r   r   )r   r   r   r   )r   r   )
r9   r:   r;   r<   r=   r"   ra   rb   r   r   r&   r#   r!   rM   rM   g   s        4 4l  NNN$$$$$$$$####"""" !%&-    <% % % %N   .2 2 2 2     r#   rM   )
__future__r   ri   rk   typingr   r   r   numpyr^   paddle.dataset.commonr   	paddle.ior   numpy.typingnptrN   __all__rG   r\   r]   r   r?   rM   r&   r#   r!   <module>r      sG   # " " " " " 				  . . . . . . . . . .     < < < < < <       5#O4
'''	;(       >               Fb b b b b b b b b br#   