o
    "i$                     @   s   d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
m
Z
 ddlmZ ddlmZ ddlmZ ddlZeeZG dd	 d	eeZG d
d deeZG dd dZdedefddZdS )zh
Training Dataset Manager Service
Manages dataset creation, versioning, quality monitoring, and exports
    N)OptionalListDictAnyTuple)datetime)Enum)Session)funcc                   @      e Zd ZdZdZdZdZdS )DatasetSplitzDataset split typestrain
validationtestN)__name__
__module____qualname____doc__TRAIN
VALIDATIONTEST r   r   W/lsinfo/ai/hellotax_ai/training_center/backend/app/services/training_dataset_manager.pyr      
    r   c                   @   r   )ExportFormatzExport format typesjsoncsvjsonlN)r   r   r   r   JSONCSVJSONLr   r   r   r   r      r   r   c                   @   sD  e Zd ZdZdefddZ	d'dededed	ed
ee	 de	ee
f fddZdeded	ede	ee
f fddZ		d(dededededee d
ee	 de	ee
f fddZ			d)dedededede	eef f
ddZdede	ee
f fddZ	d'ded edee defd!d"Zde	ee
f fd#d$Zde	ee
f fd%d&ZdS )*TrainingDatasetManagerz#Training dataset management servicedbc                 C   s
   || _ d S Nr"   )selfr"   r   r   r   __init__#   s   
zTrainingDatasetManager.__init__Nnamedescriptiondataset_type
created_bymetadatareturnc                 C   sp   ddl m} ||||d|dddd|pi d
}| j| | j  | j| td|j d|  | 	|S )a.  
        Create new dataset

        Args:
            name: Dataset name
            description: Dataset description
            dataset_type: Type of dataset
            created_by: Creator user ID
            metadata: Additional metadata

        Returns:
            Created dataset data
        r   TrainingDataset   )
r'   r(   r)   versionr*   total_samplestrain_samplesvalidation_samplestest_samplesr+   zCreated dataset z: )

app.modelsr.   r"   addcommitrefreshloggerinfoid_dataset_to_dict)r%   r'   r(   r)   r*   r+   r.   datasetr   r   r   create_dataset&   s$   

z%TrainingDatasetManager.create_dataset
dataset_idc                 C   s   ddl m} zU| j||j|k  }|s!td| d||j	||j
|jd ||dddd|j d}| j| | j  | j| td|j d|j	  | |W S  tym } z| j   d	}~ww )
zCreate new dataset versionr   r-   Dataset 
 not foundr/   )r'   r(   r)   r0   parent_version_idr*   r1   r2   r3   r4   	meta_datazCreated dataset version z for dataset N)r5   r.   r"   queryfilterr;   with_for_updatefirst
ValueErrorr'   r)   r0   rC   copyr6   r7   r8   r9   r:   r<   	Exceptionrollback)r%   r?   r(   r*   r.   parentr=   er   r   r   create_versionQ   s@   

z%TrainingDatasetManager.create_versioncontentlabelsplitsource_task_idc                 C   s   ddl m}m} | j||j|k }	|	s td| d|||||||p)i d}
| j	|
 |	 j
d7  _
|tjkrF|	 jd7  _n|tjkrS|	 jd7  _n|tjkr_|	 jd7  _| j  | j|
 td| d|  | |
S )	zAdd sample to datasetr   r.   DatasetSampler@   rA   )r?   rO   rP   rQ   rR   r+   r/   zAdded sample to dataset z, split=)r5   r.   rT   r"   rD   rE   r;   rG   rH   r6   r1   r   r   r2   r   r3   r   r4   r7   r8   r9   r:   _sample_to_dict)r%   r?   rO   rP   rQ   rR   r+   r.   rT   r=   sampler   r   r   
add_sample|   s8   
	




z!TrainingDatasetManager.add_sampleffffff?333333?train_ratiovalidation_ratio
test_ratioc                 C   s  ddl m} ddl}t|| | d dkrtd| j||j|k	 }|
| t|}t|| }	|	t||  }
t|D ]\}}||	k rPtj|_qC||
k rYtj|_qCtj|_qC| j  td| d|	 d	|
|	  d
||
   |	|
|	 ||
 dS )zAuto-split dataset samplesr   rT   Ng      ?g{Gz?zSplit ratios must sum to 1.0zSplit dataset z: train=z, val=z, test=)r   r   r   )r5   rT   randomabsrH   r"   rD   rE   r?   allshufflelenint	enumerater   r   rQ   r   r   r7   r9   r:   )r%   r?   rZ   r[   r\   rT   r^   samplestotal	train_endval_endirV   r   r   r   split_dataset   s,   




*z$TrainingDatasetManager.split_datasetc                 C   s   ddl m}m} | j||j|k }|s td| d| j||j	|k
 }i }|D ]}||jdd ||j< q2|j|j|j|j|t|dS )z!Calculate dataset quality metricsr   rS   r@   rA   r/   )r1   r2   r3   r4   label_distribution
num_labels)r5   r.   rT   r"   rD   rE   r;   rG   rH   r?   r`   getrP   r1   r2   r3   r4   rb   )r%   r?   r.   rT   r=   re   
label_distrV   r   r   r   get_quality_metrics   s.   z*TrainingDatasetManager.get_quality_metricsformatc           
         s   ddl m}  j||j|k}|r||j|k}| }|tj	kr3t
j fdd|D ddS |tjkrDd fdd|D S |tjkrqt }tj|g d	d
}|  |D ]}	||	j|	j|	j|	jd	 q\| S dS )z"Export dataset in specified formatr   r]   c                    s   g | ]}  |qS r   )rU   .0sr%   r   r   
<listcomp>   s    z9TrainingDatasetManager.export_dataset.<locals>.<listcomp>   )indent
c                    s   g | ]
}t  |qS r   )r   dumpsrU   rq   rt   r   r   ru      s    )r;   rO   rP   rQ   )
fieldnamesN)r5   rT   r"   rD   rE   r?   rQ   r`   r   r   r   ry   r    joinr   ioStringIOr   
DictWriterwriteheaderwriterowr;   rO   rP   getvalue)
r%   r?   rp   rQ   rT   rD   re   outputwriterrs   r   rt   r   export_dataset   s&   


z%TrainingDatasetManager.export_datasetc                 C   sZ   |j |j|j|j|j|j|j|j|j|j	|j
|jr|j nd|jr'|j nd|jdS )zConvert dataset model to dictN)r;   r'   r(   r)   r0   rB   r1   r2   r3   r4   r*   
created_at
updated_atrC   )r;   r'   r(   r)   r0   rB   r1   r2   r3   r4   r*   r   	isoformatr   rC   )r%   r=   r   r   r   r<     s   z'TrainingDatasetManager._dataset_to_dictc              	   C   s4   |j |j|j|j|j|j|jr|j nd|jdS )zConvert sample model to dictN)r;   r?   rO   rP   rQ   rR   r   rC   )	r;   r?   rO   rP   rQ   rR   r   r   rC   )r%   rV   r   r   r   rU     s   z&TrainingDatasetManager._sample_to_dictr#   )NN)rX   rY   rY   )r   r   r   r   r	   r&   strrc   r   r   r   r>   rN   r   rW   floatrj   ro   r   r   r<   rU   r   r   r   r   r!       s    	

+

1

0

$ 
r!   r"   r,   c                 C   s   t | S )z%Get training dataset manager instance)r!   r$   r   r   r   get_training_dataset_manager(  s   r   )r   loggingr   r   typingr   r   r   r   r   r   enumr   sqlalchemy.ormr	   
sqlalchemyr
   r|   	getLoggerr   r9   r   r   r   r!   r   r   r   r   r   <module>   s"    
  
