
    Xj5                         d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZ  e
e          Z G d dee          Z G d d	ee          Z G d
 d          ZdedefdZdS )    N)Enum)Any)Session)
get_loggerlog_executionc                       e Zd ZdZdZdZdS )DatasetSplittrain
validationtestN)__name__
__module____qualname__TRAIN
VALIDATIONTEST     W/lsinfo/ai/hellotax_ai/training_center/backend/app/services/training_dataset_manager.pyr	   r	      s        EJDDDr   r	   c                       e Zd ZdZdZdZdS )ExportFormatjsoncsvjsonlN)r   r   r   JSONCSVJSONLr   r   r   r   r      s        D
CEEEr   r   c                      e Zd ZdefdZ ee          ddedededede	dz  d	e	ee
f         fd
            Zdededed	e	ee
f         fdZddedededededz  de	dz  d	e	ee
f         fdZ ee          ddedededed	e	eef         f
d            Zded	e	ee
f         fdZddedededz  d	efdZd	e	ee
f         fdZd	e	ee
f         fdZdS ) TrainingDatasetManagerdbc                     || _         d S Nr    )selfr    s     r   __init__zTrainingDatasetManager.__init__   s    r   Nnamedescriptiondataset_type
created_bymetadatareturnc                 J   ddl m}  ||||d|dddd|pi 
  
        }| j                            |           | j                                         | j                            |           t                              d|j         d|            | 	                    |          S )Nr   TrainingDataset   )
r&   r'   r(   versionr)   total_samplestrain_samplesvalidation_samplestest_samplesr*   zCreated dataset z: )

app.modelsr.   r    addcommitrefreshloggerinfoid_dataset_to_dict)r$   r&   r'   r(   r)   r*   r.   datasets           r   create_datasetz%TrainingDatasetManager.create_dataset   s    ......!/tS_ij  xB  RS  cd  yz  IJ  U]  Uc  ac  d  d  dG   ;wz;;T;;<<<$$W---r   
dataset_idc                    ddl m} 	 | j                            |                              |j        |k                                                                              }|st          d| d           ||j	        ||j
        |j        dz   ||dddd|j                                                  }| j                            |           | j                                         | j                            |           t"                              d|j         d|j	                    |                     |          S # t(          $ r | j                                          w xY w)	Nr   r-   Dataset 
 not foundr/   )r&   r'   r(   r0   parent_version_idr)   r1   r2   r3   r4   	meta_datazCreated dataset version z for dataset )r5   r.   r    queryfilterr;   with_for_updatefirst
ValueErrorr&   r(   r0   rD   copyr6   r7   r8   r9   r:   r<   	Exceptionrollback)r$   r?   r'   r)   r.   parentr=   s          r   create_versionz%TrainingDatasetManager.create_version%   s   ......	W]]?33::?;MQ[;[\\llnnttvvF D !BJ!B!B!BCCC%o6;K^d^q  |B  |J  MN  |N  bl  yC  ST  de  z{  JK  W]  Wg  Wl  Wl  Wn  Wn  o  o  oGGKK   GNNGOOG$$$KK^7?^^QWQ\^^___((111 	 	 	G	s   D9E %E'contentlabelsplitsource_task_idc                    ddl m}m} | j                            |                              |j        |k                                              }	|	st          d| d           |||||||pi           }
| j        	                    |
           |	xj
        dz  c_
        |t          j        k    r|	xj        dz  c_        nA|t          j        k    r|	xj        dz  c_        n |t          j        k    r|	xj        dz  c_        | j                                         | j                            |
           t(                              d| d|            |                     |
          S )	Nr   DatasetSampler.   rA   rB   )r?   rO   rP   rQ   rR   r*   r/   zAdded sample to dataset z, split=)r5   rU   r.   r    rE   rF   r;   rH   rI   r6   r1   r	   r   r2   r   r3   r   r4   r7   r8   r9   r:   _sample_to_dict)r$   r?   rO   rP   rQ   rR   r*   rU   r.   r=   samples              r   
add_samplez!TrainingDatasetManager.add_sample5   s   ========'--00778Jj8XYY__aa 	@>
>>>???*gUZ_p~  JR  JX  VX  Y  Y  YF"L&&&!!Q&!!!l---&&!+&&&l'''  A%  JzJJ5JJKKK##F+++r   ffffff?333333?train_ratiovalidation_ratio
test_ratioc                    dd l }ddlm} t          ||z   |z   dz
            dk    rt	          d          | j                            |                              |j        |k              	                                }|
                    |           t          |          }t          ||z            }	|	t          ||z            z   }
t          |          D ]F\  }}||	k     rt          j        |_        ||
k     rt          j        |_        5t          j        |_        G| j                                         t(                              d| d|	 d|
|	z
   d	||
z
              |	|
|	z
  ||
z
  d
S )Nr   rU   g      ?g{Gz?zSplit ratios must sum to 1.0zSplit dataset z: train=z, val=z, test=)r
   r   r   )randomr5   rU   absrI   r    rE   rF   r?   allshufflelenint	enumerater	   r   rQ   r   r   r7   r9   r:   )r$   r?   r[   r\   r]   r`   rU   samplestotal	train_endval_endirW   s                r   split_datasetz$TrainingDatasetManager.split_datasetH   s   ,,,,,,{--
:S@AADHH;<<<'--..55m6NR\6\]]aaccwG+,,	c%*:":;;;"7++ 	1 	1IAv9}}+1W+6+0wZwwww'T]J]wwfknufuwwxxx"'I2EuW^___r   c                 
   ddl m}m} | j                            |                              |j        |k                                              }|st          d| d          | j                            |                              |j	        |k              
                                }i }|D ](}|                    |j        d          dz   ||j        <   )|j        |j        |j        |j        |t#          |          dS )Nr   rT   rA   rB   r/   )r1   r2   r3   r4   label_distribution
num_labels)r5   rU   r.   r    rE   rF   r;   rH   rI   r?   rb   getrP   r1   r2   r3   r4   rd   )r$   r?   rU   r.   r=   rg   
label_distrW   s           r   get_quality_metricsz*TrainingDatasetManager.get_quality_metrics_   s:   ========'--00778Jj8XYY__aa 	@>
>>>???'--..55m6NR\6\]]aacc
 	K 	KF'1~~flA'F'F'JJv|$$!(!6I^v}  wQ  cj  cw  OY  il  mw  ix  ix  y  y  	yr   formatc                     ddl m}  j                            |                              |j        |k              }|r|                    |j        |k              }|                                }|t          j	        k    r"t          j         fd|D             d          S |t          j        k    r!d                     fd|D                       S |t          j        k    rt          j                    }t#          j        |g d	          }|                                 |D ]0}	|                    |	j        |	j        |	j        |	j        d           1|                                S d S )
Nr   r_   c                 :    g | ]}                     |          S r   )rV   .0sr$   s     r   
<listcomp>z9TrainingDatasetManager.export_dataset.<locals>.<listcomp>q   s'    HHH1t33A66HHHr      )indent
c                 ^    g | ])}t          j                            |                    *S r   )r   dumpsrV   rv   s     r   ry   z9TrainingDatasetManager.export_dataset.<locals>.<listcomp>s   s1    SSSadj)=)=a)@)@AASSSr   )r;   rO   rP   rQ   )
fieldnames)r5   rU   r    rE   rF   r?   rQ   rb   r   r   r   r~   r   joinr   ioStringIOr   
DictWriterwriteheaderwriterowr;   rO   rP   getvalue)
r$   r?   rs   rQ   rU   rE   rg   outputwriterrx   s
   `         r   export_datasetz%TrainingDatasetManager.export_datasetj   su   ,,,,,,m,,33M4LPZ4Z[[ 	?LL!4!=>>E))++\&&&:HHHHHHHQRSSSS|)))99SSSS7SSSTTT|'''[]]F^F7Z7Z7Z[[[F    h hqt	AG^_^e f fgggg??$$$ ('r   c                    |j         |j        |j        |j        |j        |j        |j        |j        |j        |j	        |j
        |j        r|j                                        nd |j        r|j                                        nd |j        dS )N)r;   r&   r'   r(   r0   rC   r1   r2   r3   r4   r)   
created_at
updated_atrD   )r;   r&   r'   r(   r0   rC   r1   r2   r3   r4   r)   r   	isoformatr   rD   )r$   r=   s     r   r<   z'TrainingDatasetManager._dataset_to_dict|   s6   j',wGZls  mA  NU  N]  t{  tM  `g  `u  HO  H]  u|  uO  ah  au  EL  EW  IP  I[  ge  gn  gy  gC  gC  gE  gE  gE  ae  W^  Wi  us  u|  uG  uQ  uQ  uS  uS  uS  os  B	I	  B	S	  T	  T	  	T	r   c           	          |j         |j        |j        |j        |j        |j        |j        r|j                                        nd |j        dS )N)r;   r?   rO   rP   rQ   rR   r   rD   )	r;   r?   rO   rP   rQ   rR   r   r   rD   )r$   rW   s     r   rV   z&TrainingDatasetManager._sample_to_dict   s    iv/@V^flfr  ~D  ~J  ^d  ^s  dj  du  C  CI  CT  C^  C^  C`  C`  C`  {  NT  N^  _  _  	_r   r"   )NN)rY   rZ   rZ   )r   r   r   r   r%   r   r9   strre   dictr   r>   rN   r	   rX   floatrl   rr   r   r   r<   rV   r   r   r   r   r      s       7     ]6. .3 .S . .Y\ .hloshs .  ~B  CF  HK  CK  ~L . . . . 3 C TXY\^aYaTb     , ,S ,3 ,s ,< ,ilosis ,  EI  LP  EP ,  Z^  _b  dg  _g  Zh , , , ,& ]6` ` `% `W\ `ot `  C  DG  IL  DL  M ` ` ` `,	yc 	yd38n 	y 	y 	y 	y% % %l %<Z^K^ %hk % % % %$T	4S> T	 T	 T	 T	_c3h _ _ _ _ _ _r   r   r    r+   c                      t          |           S r"   )r   r#   s    r   get_training_dataset_managerr      s    !"%%%r   )r   r   r   enumr   typingr   sqlalchemy.ormr   common_loggingr   r   r   r9   r   r	   r   r   r   r   r   r   <module>r      s;   



 				              " " " " " " 4 4 4 4 4 4 4 4	H		    3   
    3   
j_ j_ j_ j_ j_ j_ j_ j_X&W &1G & & & & & &r   