o
    di*                     @   s   d Z ddlZddlZddlZddlmZmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ erVdd
lmZmZ ddlmZ eeZ eeef Z!G dd deZ"dS )z@Base class to peform task-specific preprocessing and evaluation.    N)ABCabstractmethod)
TYPE_CHECKINGAnyCallableDictListOptionalSetTupleTypeUnion)PreTrainedTokenizerBase)BaseImageProcessor)requires_backends   )logging)DatasetDatasetDict)PretrainedConfigc                   @   sF  e Zd ZU eedf ed< eeeee	f f ed< eeef ed< e
e ed< ee ed< 	d,dd	d
edeeee	f  fddZdeeee	f  deeee	f eee	f f fddZe	d,deee	f deeef deee  deee	f fddZ	d,deeef deee  deeee	f geee	f f fddZ		d-ded deeef deee  dee ded f
ddZedee deeeef  fddZedee deee  fd d!Z			"	"		"	d.d#edeeeef  deee  d$ed%ed&ee d'eded fd(d)Z	"	"		"d/d$ed%ed&ee d'efd*d+ZdS )0TaskProcessor.ACCEPTED_PREPROCESSOR_CLASSESDEFAULT_DATASET_ARGSDEFAUL_DATASET_DATA_KEYSALLOWED_DATA_KEY_NAMESDEFAULT_REF_KEYSNconfigr   preprocessorpreprocessor_kwargsc                 C   sZ   t || jstdt| dddd | jD  d|| _|| _| |\| _| _	dS )a  
        Initializes the class in charge of loading processed datasets and of running evaluation.

        This class should be task-dependent, backend independent.

        Args:
            config (`PretrainedConfig`):
                The config of the model.
            preprocessor: (`Preprocessor`):
                The preprocessor associated to the model. This will be used to prepare the datasets.
            preprocessor_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`):
                Keyword arguments that will be passed to the preprocessor during dataset processing.
                This allows customizing the behavior of the preprocessor.
        z3Preprocessor is incorrect, provided an instance of z) but expected one of the following type: , c                 s   s    | ]}|j V  qd S N)__name__).0cls_ r$   n/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/optimum/utils/preprocessing/base.py	<genexpr>G   s    z)TaskProcessor.__init__.<locals>.<genexpr>.N)

isinstancer   
ValueErrortypejoinr   r   3create_defaults_and_kwargs_from_preprocessor_kwargsdefaultsr   )selfr   r   r   r$   r$   r%   __init__0   s   zTaskProcessor.__init__returnc                 C   s   |du ri }i t |fS )ah  
        Takes the dictionary of the preprocessor keyword arguments and return two dictionaries:
            - The first dictionary will either contain defaults values if not specified in preprocessor_kwargs or the
            values specified in preprocessor_kwargs.
            - The second dictionary will contain the rest of the keyword arguments.
        N)copydeepcopy)r.   r   r$   r$   r%   r,   P   s   	zATaskProcessor.create_defaults_and_kwargs_from_preprocessor_kwargsexample	data_keysref_keysc                 C      t dNz.This method must be implemented in subclasses.NotImplementedError)r.   r3   r4   r5   r$   r$   r%   dataset_processing_func]   s   z%TaskProcessor.dataset_processing_funcc                 C   s   t j| j||dS )Nr4   r5   )	functoolspartialr:   )r.   r4   r5   r$   r$   r%   create_dataset_processing_funcc   s   z,TaskProcessor.create_dataset_processing_funcdataset)r   r   splitc                 C   sT   t | dg ddlm} t||r|d urtd|d ur!|| }|| ||S )Ndatasetsr   )r   zIA Dataset and a split name were provided, but splits are for DatasetDict.)r   rA   r   r(   r)   mapr>   )r.   r?   r4   r5   r@   r   r$   r$   r%   prepare_dataseth   s   zTaskProcessor.prepare_datasetcolumn_namesc                 C   r6   r7   r8   r.   rD   r$   r$   r%   try_to_guess_data_keysx      z$TaskProcessor.try_to_guess_data_keysc                 C   r6   r7   r8   rE   r$   r$   r%   try_to_guess_ref_keys|   rG   z#TaskProcessor.try_to_guess_ref_keysFpathonly_keep_necessary_columnsload_smallest_splitnum_samplesshufflec	                    sz  t | dg ddlm}
m}m} ddlm} ddlm} |d u r&|| d}||fi |	d|i}t||rd|rd|		dd }|d urKt
d	| d
t| dd dd }td| d || }|rj| }t||
r|d ur||jkrt
d|j d| d|t|}|j}t|trtttj| }|d u rtd | |}|d u rt
dd| td|  nt| | jkst
dt|  d| j d|d u r|  |}td| d | j!|||d}|r;|d ur|ng }| j"j#|  t||r,| D ]\}} fdd|jD }|$|||< q|S  fdd|jD }|$|}|S )NrA   r   )r   r   DownloadConfig)load_dataset)http_user_agent)
user_agentdownload_configr@   zA split name was provided (z?) but load_smallest_split is True, use either one or the other.c                 S   s
   | d j S )N   )num_rows)itemr$   r$   r%   <lambda>   s   
 z,TaskProcessor.load_dataset.<locals>.<lambda>)keyzvSince no split was explicitely provided and load_smallest_split=True, using the smallest split of the dataset called "z".zThere are only z9 examples in the dataset, but it was requested to select z
 examples.z4As no data keys were provided, trying to guess them.zMData keys need to be specified manually since they could not be guessed from r   z!Guessed the following data keys: z"data_keys contains unallowed keys z, allowed_keys: r'   z3As no ref keys were provided, tried to guess them: r;   c                       g | ]}| vr|qS r$   r$   r"   namenecessary_columnsr$   r%   
<listcomp>       z.TaskProcessor.load_dataset.<locals>.<listcomp>c                    rX   r$   r$   rY   r[   r$   r%   r]      r^   )%r   rA   r   r   rN   rO   transformers.utilsrP   r(   getr)   minitemsloggerinforM   rT   selectrangerD   dictlistset	itertoolschainfrom_iterablevalueswarningrF   r+   keysr   rH   rC   r   model_input_namesremove_columns)r.   rI   r4   r5   rJ   rK   rL   rM   rR   load_dataset_kwargsr   r   rN   datasets_load_datasetrP   r?   r@   smallest_splitrD   
split_namecolumns_to_remover$   r[   r%   rO      s|   







zTaskProcessor.load_datasetc           	   	   K   s   t | jtrA| jdd }|d u rtdt| j t| @ }|r.d| t	d t
|}|dd | j D  n| j}|}| j|f| j| j||||d|S )NrI   zWhen DEFAULT_DATASET_ARGS is a dictionary, it must contain a key called "path" corresponding to the path or name of the dataset.r   zThe following provided arguments will be overriden because they are hardcoded when using load_default_dataset: {override_config_key}.c                 S   s   i | ]\}}|d kr||qS )rI   r$   )r"   kvr$   r$   r%   
<dictcomp>   s    z6TaskProcessor.load_default_dataset.<locals>.<dictcomp>)r4   r5   rJ   rK   rL   rM   )r(   r   rg   r`   r)   ri   ro   r+   rc   rn   r1   r2   updaterb   rO   r   r   )	r.   rJ   rK   rL   rM   rr   rI   common_keyskwargsr$   r$   r%   load_default_dataset   s8   

z"TaskProcessor.load_default_datasetr    )NN)NNFFNFN)FFNF)r!   
__module____qualname__r   r   __annotations__r   strr   r   r
   r   Preprocessorr	   r/   r,   r   r:   r   r>   rC   rF   rH   boolintrO   r}   r$   r$   r$   r%   r   )   s   
 
 







	


$ 

Wr   )#__doc__r1   r<   rj   abcr   r   typingr   r   r   r   r   r	   r
   r   r   r   transformersr   #transformers.image_processing_utilsr   optimum.utils.import_utilsr    r   rA   r   r   r   
get_loggerr!   rc   r   r   r$   r$   r$   r%   <module>   s    0
