o
    ~ri'                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
mZmZmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ e eeZ!eeZ"ej#d	d
d e"D e$dZ%eee$ ej&dd'e! dee!df Z(eddZ)e)j*dg ddddddddej+ddf	dedededeee% ej&ddf dede(ded ed!ed"dfd#d$Z,e)j*d%d&d'gd			d<d(ee$ej-d)df d*ede(d!ed"df
d+d,Z.e)j*d-g d.dddej+ddfd(ee$ej-d)df d/eee$ ej&d0d1df d2eee$ ej&d3df ded ed!ed"dfd4d5Z/e)j*d6d7d8gdej+dfd6ee$ej-d9df ded!ed"dfd:d;Z0dS )=a  Contains commands to interact with datasets on the Hugging Face Hub.

Usage:
    # list datasets on the Hub
    hf datasets ls

    # list datasets with a search query
    hf datasets ls --search "code"

    # get info about a dataset
    hf datasets info HuggingFaceFW/fineweb
    N)	AnnotatedOptionalget_args)execute_raw_sql_query)CLIErrorRepositoryNotFoundErrorRevisionNotFoundError)DatasetSort_TExpandDatasetProperty_T   )	AuthorOpt	FilterOpt	FormatOptLimitOptOutputFormatQuietOptRevisionOpt	SearchOptTokenOptapi_object_to_dict
get_hf_apimake_expand_properties_parserprint_list_outputtyper_factoryDatasetSortEnumc                 C   s   i | ]}||qS  r   ).0sr   r   j/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/huggingface_hub/cli/datasets.py
<dictcomp>9   s    r   )typezComma-separated properties to return. When used, only the listed properties (and id) are returned. Example: '--expand=downloads,likes,tags'. Valid: z, .)helpcallbackz"Interact with datasets on the Hub.)r"   z	list | ls)zhf datasets lsz*hf datasets ls --sort downloads --limit 10zhf datasets ls --search "code")examples
   FsearchauthorfiltersortzSort results.limitexpandformatquiettokenreturnc	              	   C   sJ   t |d}	|r
|jnd}
dd |	j||| |
||dD }t|||d dS )zList datasets on the Hub.r.   Nc                 S   s   g | ]}t |qS r   )r   )r   dataset_infor   r   r   
<listcomp>a   s    zdatasets_ls.<locals>.<listcomp>)r(   r'   r&   r)   r*   r+   r,   r-   )r   valuelist_datasetsr   )r&   r'   r(   r)   r*   r+   r,   r-   r.   apisort_keyresultsr   r   r   datasets_lsH   s   
r9   infoz&hf datasets info HuggingFaceFW/finewebz9hf datasets info my-dataset --expand downloads,likes,tags
dataset_idz+The dataset ID (e.g. `username/repo-name`).revisionc              
   C   s   t |d}z
|j| ||d}W n, ty$ } z	td|  d|d}~w ty; } ztd| d|  d|d}~ww ttjt|d	d
 dS )z>Get info about a dataset on the Hub. Output is in JSON format.r0   )repo_idr<   r+   z	Dataset 'z' not found.Nz
Revision 'z' not found on 'z'.   )indent)	r   r1   r   r   r   printjsondumpsr   )r;   r<   r+   r.   r6   r:   er   r   r   datasets_infoo   s   
rD   parquet)z(hf datasets parquet cfahlgren1/hub-statsz8hf datasets parquet cfahlgren1/hub-stats --subset modelsz6hf datasets parquet cfahlgren1/hub-stats --split trainz6hf datasets parquet cfahlgren1/hub-stats --format jsonsubsetz--subsetz(Filter parquet entries by subset/config.splitz Filter parquet entries by split.c           
         sL   t |d}|j| |d} fdd|D }dd |D }	t|	||dd dS )	z/List parquet file URLs available for a dataset.r0   )r=   configc                    s"   g | ]} d u s|j  kr|qS )NrG   r   entryrI   r   r   r2      s   " z$datasets_parquet.<locals>.<listcomp>c                 S   s"   g | ]}|j |j|j|jd qS ))rF   rG   urlsize)rH   rG   rL   rM   rJ   r   r   r   r2      s    rL   )r,   r-   id_keyN)r   list_dataset_parquet_filesr   )
r;   rF   rG   r,   r-   r.   r6   entriesfilteredr8   r   rI   r   datasets_parquet   s   
rR   sqlzhf datasets sql "SELECT COUNT(*) AS rows FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet')"zhf datasets sql "SELECT * FROM read_parquet('https://huggingface.co/api/datasets/cfahlgren1/hub-stats/parquet/models/train/0.parquet') LIMIT 5" --format jsonzRaw SQL query to execute.c              
   C   sJ   zt | |d}W n ty } ztt||d}~ww t||dd dS )zAExecute a raw SQL query with DuckDB against dataset parquet URLs.)	sql_queryr.   NFr3   )r   ImportErrorr   strr   )rS   r,   r.   resultrC   r   r   r   datasets_sql   s   rX   )NNN)1__doc__enumrA   typingr   r   r   typerhuggingface_hub._dataset_viewerr   huggingface_hub.errorsr   r   r   huggingface_hub.hf_apir	   r
   
_cli_utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   sorted_EXPAND_PROPERTIES_SORT_OPTIONSEnumrV   r   Optionjoin	ExpandOptdatasets_clicommandtabler9   ArgumentrD   rR   rX   r   r   r   r   <module>   s   @
		
	
		