o
    diE$                     @   sr  d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
mZ e r)d dlmZ 	 	d&deeeejf  dedee fd	d
Z		d'deeeejf  dedee deeejf fddZd(dedededefddZd(dedededefddZd(dedededefddZd(dedededefddZd(dedededefddZ	d)d"edededed#edefd$d%ZdS )*    N)AnyDictListOptional)DATASETS_IMPORT_ERRORis_datasets_available)load_dataset   examples
batch_sizepad_token_idc                    sx   g | D ]}|d }|d } t|t|d q dkr)du r)td fddtd	t D S )
a  
    Prepare the dataset by making sure that we have the right format and `batch_size`
    Args:
        examples (`List[Dict[str, torch.LongTensor]]`):
            List of data to prepare
        batch_size (`int`, defaults to `1`):
            Batch size of the data
        pad_token_id (`Optional[int]`, defaults to `None`):
            Pad token id of the model
    Returns:
        ` List[Dict[str, torch.LongTensor]]`: Batched dataset
    	input_idsattention_maskr   r   r	   NzfYou need to pass a `pad_token_id` in `quantize_model` if you want to have examples with batch size > 1c                    s&   g | ]}t ||   d dqS )F)contain_labelsr   )collate_data).0startr   new_examplesr    _/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/optimum/gptq/data.py
<listcomp><   s    z#prepare_dataset.<locals>.<listcomp>r   )appendtorch
LongTensor
ValueErrorrangelen)r
   r   r   exampler   r   r   r   r   prepare_dataset"   s   r    Fblocksr   returnc                 C   sP  dd }dd | D }dd | D }|r$dd | D }t dd |D }t| }t dd |D }	t|D ]P}
||
 j\}}|	| }|d	krd|||
 t||f| ||
< |||
 t||f||
< |r||
 jd
 }|| }|d	kr|||
 t||fd ||
< q5tj|d	d tj|d	d d}|rtj|d	d |d< |S )a  
        Collate data in `blocks`
    Args:
        blocks (`List[Dict[str, torch.LongTensor]]`):
            List of tensors that we need to batch together
        pad_token_id (`Optional[int]`, defaults to `None`):
            Pad token id of the model
        contain_labels (`bool`, defaults to `False`):
           Set True to also process the labels

    Returns:
        `Dict[str, torch.LongTensor]`: Batched data
    c                 S   s   t j|| j| fdd S )Ndim)r   cattodevicelong)blockpadsr   r   r   	pad_blockV   s   zcollate_data.<locals>.pad_blockc                 S      g | ]}|d  qS )r   r   r   r*   r   r   r   r   Y       z collate_data.<locals>.<listcomp>c                 S   r-   )r   r   r.   r   r   r   r   Z   r/   c                 S   r-   )labelsr   r.   r   r   r   r   \   r/   c                 S      g | ]}| d qS r#   sizer.   r   r   r   r   ]       c                 S   r1   r2   r3   r.   r   r   r   r   `   r5   r   r#   ir$   r   r0   )	maxr   r   shaper   oneszerosr&   r)   )r!   r   r   r,   input_ids_blocksattention_mask_blockslabel_blockslabel_max_lenbszinp_max_leni	block_bszblock_inp_lenpad_numblock_label_lenlabel_pad_numdatar   r   r   r   C   s4     r   train	tokenizerseqlennsamplessplitc                 C   s   t  s
ttd|dkrtdddd}n|dkr!tdddd}dd	d
 |d d d D }| |dd}g }t|D ],}td|j	j
d | d }	|	| }
|j	d d |	|
f }t|}|||d q=|S )Nget_wikitext2rG   wikitextzwikitext-2-raw-v1)rK   
validationtest c                 S   s   g | ]
}|d kr
dn|qS )rP   z 
r   )r   sr   r   r   r      s    z!get_wikitext2.<locals>.<listcomp>texti  ptreturn_tensorsr   r	   r   )r   ImportErrorr   formatr   joinr   randomrandintr   r7   r   	ones_liker   )rH   rI   rJ   rK   rF   rR   encdataset_r@   jinpr   r   r   r   rL   x   s     
rL   c                 C      t  s
ttd|dkrtddddid}n|dkr%tddddid}g }t|D ]K}	 td	t|d
 }| || d dd}|j	j
d
 |krKnq.td	|j	j
d
 | d
 }|| }	|j	d d ||	f }
t|
}||
|d q+|S )Nget_c4rG   
allenai/c4"en/c4-train.00000-of-01024.json.gzrK   
data_filesrN   'en/c4-validation.00000-of-00008.json.gzTr   r	   rR   rS   rT   r   r   rV   r   rW   r   r   rY   rZ   r   r   r7   r   r[   r   rH   rI   rJ   rK   rF   r]   r^   r@   r\   r_   r`   r   r   r   r   rb      0   
rb   c                 C   ra   )N
get_c4_newrG   rc   rd   re   rN   rg   Tr   r	   rR   rS   rT   r   rh   ri   r   r   r   rk      rj   rk   c                 C      t dNz(Loading the `ptb` dataset was deprecatedRuntimeErrorrH   rI   rJ   rK   r   r   r   get_ptb      rq   c                 C   rl   rm   rn   rp   r   r   r   get_ptb_new   rr   rs         dataset_nameseedc                 C   s   t | tj | tj | tttd}|dvr"td| | dv r1t|  dt	| | |vrCtdt	|
  d|  ||  }|||||dS )	a  
    Get the dataset from the original paper of GPTQ

    Args:
        dataset_name (`str`):
            Dataset name. Available options are `['wikitext2', 'c4', 'c4-new']`.
        tokenizer (`Any`):
            Tokenizer of the model
        nsamples (`int`, defaults to `128`):
            Number of samples
        seqlen (`int`, defaults to `2048`):
            The sequence length of the model
        seed (`int`, defaults to `0`):
            Seed
        split (`str`, defaults to `train`):
            Split of the dataset. Can be either "train" or "validation"
    Returns:
        `List[Dict[str,torch.LongTensor]]`: The tokenized dataset.
    )	wikitext2c4zc4-new)rG   rN   z7The split need to be 'train' or 'validation' but found >   ptb-newptbzD dataset was deprecated, only the following dataset are supported : zExpected a value in z but found )rH   rJ   rI   rK   )rY   rw   npr   manual_seedrL   rb   rk   r   listkeys)rv   rH   rJ   rI   rw   rK   get_dataset_mapget_dataset_fnr   r   r   get_dataset   s"   
r   )r	   N)FN)rG   )rt   ru   r   rG   )rY   typingr   r   r   r   numpyr|   r   optimum.utils.import_utilsr   r   datasetsr   strr   intr    boolr   rL   rb   rk   rq   rs   r   r   r   r   r   <module>   s\   
#
5