
    )jt%                        d dl Z d dlZd dlmZ d dlmZmZmZ d dlm	Z	  G d d          Z
 G d d          Z G d	 d
          Z G d d          Z G d d          Zde	fdZdede	fdZdede	fdZde	fdZde	fdZdS )    N)Path)AnyDictList)PreTrainedTokenizerc                   ^    e Zd ZdZ	 ddeeeef                  dedefdZd Z	de
fd	Zd
 ZdS )TextDatasetz1
    Light-weight wrapper to hold a dataset.
    textdata	tokenizertext_keyc                 0    || _         || _        || _        d S N)_datar   r   )selfr   r   r   s       _/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/tuner/datasets.py__init__zTextDataset.__init__   s     
"     c                     | j                             || j                           }|d         | j         j        k    r|                    | j         j                   |dfS )Nr   )r   encoder   eos_token_idappendr   ds     r   processzTextDataset.process   sR    N!!!DM"233R5DN///HHT^01111vr   idxc                     | j         |         S r   r   r   r   s     r   __getitem__zTextDataset.__getitem__        z#r   c                 *    t          | j                  S r   lenr   r   s    r   __len__zTextDataset.__len__#       4:r   N)r
   )__name__
__module____qualname____doc__r   r   strr   r   r   intr!   r'    r   r   r	   r	      s          	! !4S>"! '! 	! ! ! !  s        r   r	   c            	       d    e Zd ZdZ	 	 ddeeeef                  dededefdZ	d	 Z
d
efdZd ZdS )ChatDatasetz
    A dataset for chat data in the format of {"messages": [...]}
    https://platform.openai.com/docs/guides/supervised-fine-tuning#formatting-your-data
    messagesFr   r   chat_keymask_promptc                 >    || _         || _        || _        || _        d S r   )r   r3   r4   r   )r   r   r   r3   r4   s        r   r   zChatDataset.__init__-   s%     
 &"r   c                 D   || j                  }|                    dd           }| j                            ||d          }| j        rV|d                             d          dk    }t          | j                            |d d         ||d                    }||fS |dfS )	NtoolsFr7   return_dictr   role	assistantr7   add_generation_promptr9   r   )r3   getr   apply_chat_templater4   r%   )r   r   r2   r7   tokensr=   offsets          r   r   zChatDataset.process9   s    T]#gt$$33 4 
 

  	$,RL$4$4V$<$<$K!22SbSM*? %	 3   F F##A;r   r   c                     | j         |         S r   r   r    s     r   r!   zChatDataset.__getitem__O   r"   r   c                 *    t          | j                  S r   r$   r&   s    r   r'   zChatDataset.__len__R   r(   r   N)r2   Fr)   r*   r+   r,   r   r   r-   r   boolr   r   r.   r!   r'   r/   r   r   r1   r1   '   s          #!
# 
#4S>"
# '
# 	
#
 
# 
# 
# 
#  ,s        r   r1   c            
       b    e Zd ZdZdeeeef                  dedededef
dZ	d Z
d	efd
Zd ZdS )CompletionsDatasetz
    A dataset for prompt-completion data in the format of {"prompt": ..., "completion": ...}
    or using user-provided keys for prompt and completion values
    https://platform.openai.com/docs/guides/fine-tuning/example-format
    r   r   
prompt_keycompletion_keyr4   c                 L    || _         || _        || _        || _        || _        d S r   )r   rH   rI   r4   r   )r   r   r   rH   rI   r4   s         r   r   zCompletionsDataset.__init__]   s-     
$,&"r   c                 ,   |                     dd           }d|| j                 dd|| j                 dg}| j                            ||d          }| j        r7t          | j                            |d d         |dd	                    }||fS |d
fS )Nr7   user)r:   contentr;   Fr8   r   Tr<   r   )r>   rH   rI   r   r?   r4   r%   )r   r   r7   r2   r@   rA   s         r   r   zCompletionsDataset.processk   s    gt$$$/(:;; Qt/B-CDD
 33Eu 4 
 
  		$22SbSM*. %	 3   F F##{r   r   c                     | j         |         S r   r   r    s     r   r!   zCompletionsDataset.__getitem__   r"   r   c                 *    t          | j                  S r   r$   r&   s    r   r'   zCompletionsDataset.__len__   r(   r   NrD   r/   r   r   rG   rG   V   s         #4S>"# '# 	#
 # # # # #  ,s        r   rG   c                   >    e Zd Zdee         fdZdefdZd Zd Z	dS )ConcatenatedDatasetr   c                 Z    || _         t          d | j         D                       | _        d S )Nc              3   4   K   | ]}t          |          V  d S r   )r%   ).0r   s     r   	<genexpr>z/ConcatenatedDataset.__init__.<locals>.<genexpr>   s(      331A333333r   )r   sum_lenr   r   s     r   r   zConcatenatedDataset.__init__   s-    
33
33333			r   r   c                     t          | j                  D ]!\  }}|t          |          z
  }|dk     r n|}"||         }||d<   |S )Nr   _dataset)	enumerater   r%   )r   r   data_idxr   jdatums         r   r!   zConcatenatedDataset.__getitem__   s[    '
33 	 	NHdc$iiA1uuCCS	$jr   c                 N    | j         |d                                      |          S )NrZ   )r   r   r   s     r   r   zConcatenatedDataset.process   s!    z!J-(00333r   c                     | j         S r   )rW   r&   s    r   r'   zConcatenatedDataset.__len__   s
    yr   N)
r)   r*   r+   r   r   r   r.   r!   r   r'   r/   r   r   rQ   rQ      sl        4T#Y 4 4 4 4s    4 4 4    r   rQ   c                   8    e Zd ZdefdZdefdZdefdZd ZdS )CacheDatasetr   c                 D    || _         d gt          |          z  | _        d S r   )r   r%   
_proc_datarX   s     r   r   zCacheDataset.__init__   s!    
&3t99,r   r   c                 6    t          | j        |                   S r   r$   r    s     r   itemlenzCacheDataset.itemlen   s    4:c?###r   c                     | j         |         -| j                            | j        |                   | j         |<   | j         |         S r   )rd   r   r   r    s     r   r!   zCacheDataset.__getitem__   s>    ?3'#':#5#5djo#F#FDOC s##r   c                 *    t          | j                  S r   r$   r&   s    r   r'   zCacheDataset.__len__   r(   r   N)	r)   r*   r+   r   r   r.   rf   r!   r'   r/   r   r   rb   rb      st        -S - - - -$3 $ $ $ $$s $ $ $ $
    r   rb   r   c                    t          |dd          }t          |dd          }t          |dd          }t          |dd          }t          |d	d
          }| d         }||v r||v rt          | ||||          S ||v rt          | |||          S ||v r#|rt          d          t	          | ||          S t          d          )Nr4   Fprompt_featureprompttext_featurer
   completion_feature
completionchat_featurer2   r   )r3   r4   z.Prompt masking not supported for text dataset.)r   z~Unsupported data format, check the supported formats here:
https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/LORA.md#Data.)getattrrG   r1   
ValueErrorr	   )	r   r   configr4   rj   rl   rm   ro   samples	            r   create_datasetrt      s   
 &-77KV%5x@@N6>6::L )=|LL6>:>>L!WF$6&$@$@!)^-?
 
 	
 
		)l
 
 
 	
 
		 	OMNNN4\BBBBR
 
 	
r   	data_pathc                 J     fdd} fd|D             \  }}}|||fS )Nc                     |                                  sg S t          | d          5 }d |D             }d d d            n# 1 swxY w Y   t          |          S )Nrc                 6    g | ]}t          j        |          S r/   )jsonloads)rT   ls     r   
<listcomp>z;load_local_dataset.<locals>.load_subset.<locals>.<listcomp>   s     ///aDJqMM///r   )existsopenrt   )pathfidr   rr   r   s      r   load_subsetz'load_local_dataset.<locals>.load_subset   s    {{}} 	I$__ 	0//3///D	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0dIv666s   AAAtrainvalidtestc                 2    g | ]} | d z            S )z.jsonlr/   )rT   nru   r   s     r   r}   z&load_local_dataset.<locals>.<listcomp>   s.    OOOA++iQ,,,&>??OOOr   r/   )ru   r   rr   namesr   r   r   r   s   ```    @r   load_local_datasetr      s[    
7 7 7 7 7 7 'EOOOOOOOOE5$%r   data_idc                    	 ddl m}m} 	  ||           	d}	fd|D             \  }}}n!# |j        $ r t	          d|  d          w xY w|||fS )Nr   )
exceptionsload_datasetr   c                 n    g | ]1}|                                 v rt          |                   ng 2S r/   )keysrt   )rT   r   rr   datasetr   s     r   r}   z#load_hf_dataset.<locals>.<listcomp>   sV     
 
 
  && wqz9f===	
 
 
r   z Not found Hugging Face dataset: z .)datasetsr   r   DatasetNotFoundErrorrq   )
r   r   rr   r   r   r   r   r   r   r   s
    ``      @r   load_hf_datasetr      s    
 21111111I,w''*
 
 
 
 
 
 
 
 
udd * I I IGGGGGHHHI %s	   !/ Ac                    dd l fd}| j        }t          |t                    r|g}g }|D ]}|d         }t	          d| d           t          | dd          |d<   t          j        di |}|                    di           }| j	        rI|                    d	d
          }	|                    dd          }
 ||||	|          } ||||
|          }ng g }}| j
        r$|                    d          } |||||          }ng }|                    |||f           t          |          dk    r|d         S t          t          t          t!          |                     S )Nr   c                 F     j         | fd|i|}t          ||          S )Nsplit)r   rt   )dataset_namerr   r   	hf_configdsr   r   s        r   create_hf_datasetz1load_custom_hf_dataset.<locals>.create_hf_dataset   sE    "X"
 

 
 

 b)V444r   r   Loading Hugging Face dataset .r4   Frr   train_splitztrain[:80%]valid_splitztrain[-10%:]
test_split   r/   )r   
hf_dataset
isinstancedictprintrp   typesSimpleNamespacer>   r   r   r   r%   tuplemaprQ   zip)argsr   r   dataset_collection
collectionr   ds_pathrr   r   r   r   r   r   r   r   r   s    `             @r   load_custom_hf_datasetr      s   OOO5 5 5 5 5 5 $d++ 201J  #0 #0V*8g888999#D-??=&,,,,FF8R((	: 	"&&>>K&&??K%%	 E &%	 EE r5E9 		--J$$	 DD D5%.////
:!!} (#z*:;;<<<r   c                 .   t          | dd          rt          | |          \  }}}npt          | j                  }|                                rt          |||           \  }}}n2t          d| j         d           t          | j        ||           \  }}}| j        r"t          |          dk    rt          d          | j        r"t          |          dk    rt          d           | j        r"t          |          dk    rt          d          |||fS )	Nr   Fr   r   r   zKTraining set not found or empty. Must provide training set for fine-tuning.zUWarning: Validation set not found or empty. Training will proceed without validation.zBTest set not found or empty. Must provide test set for evaluation.)rp   r   r   r   r~   r   r   r   r   r%   rq   r   )r   r   r   r   r   ru   s         r   r   r   5  s;   t\5)) M3D)DDuddOO	 	M!3Iy$!O!OE5$$>$)>>>???!0It!L!LE5$z 
c%jjAooY
 
 	
 z 
c%jjAooc	
 	
 	
 y 
SYY!^^P
 
 	
 %r   )rz   r   pathlibr   typingr   r   r   transformersr   r	   r1   rG   rQ   rb   rt   r   r-   r   r   r   r/   r   r   <module>r      s           " " " " " " " " " " , , , , , ,       8, , , , , , , ,^/ / / / / / / /d       ,       "
"
 
 
 
<"   ""   69=,? 9= 9= 9= 9=x"5      r   