
    vjE                        d dl Z d dlmZ d dlmZmZmZ d dlmZm	Z	 d dl
Z
d dlZd dlmZ d dlmZ d dlmZ de	eee         f         fd	Ze G d
 d                      Ze G d d                      Ze G d d                      Z ed           G d deee                      Zd Zdede	eeedf         fdZdS )    N)deepcopy)	dataclassfieldfields)ListUnion)CliArgumentParser)Config)DEFAULT_DATASET_NAMESPACEvaluesc                    t          | t                    r|                     d          n| }i }|pg D ]R}t          |                                          dk    r(|                    d          \  }}t          |          ||<   S|S )N,r   =)
isinstancestrsplitlenstripparse_value)r   pairs_paramskvkeyvalues         q/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/modelscope/trainers/training_args.pyset_flatten_valuer      s    !+FC!8!8DFLLfEGkr * *rxxzz??aXXc]]
U"5))N    c                   x   e Zd ZU  edddi          Zeed<    edddi          Zeed<    edddi          Zeed	<    eddd
i          Z	eed<    edddi          Z
eed<    edddi          Zeed<    eeddi          Zeed<    eeddi          Zeed<    edddi          Zeed<   dS )DatasetArgsNhelpzNThe dataset name used for training, can be an id in the datahub or a local dirdefaultmetadatatrain_dataset_namezOThe subset name used for evaluating, can be an id in the datahub or a local dirval_dataset_namez.The subset name used for training, can be Nonetrain_subset_namez0The subset name used for evaluating, can be Noneval_subset_namezThe split of train datasettrain_splitzThe split of val dataset	val_splitz'The dataset namespace used for trainingtrain_dataset_namespacez)The dataset namespace used for evaluatingval_dataset_namespacea  The json file to parse all datasets from, used in a complex dataset scenario,the json format should be like:
                    [
                        {
                            "dataset": {
                                # All args used in the MsDataset.load function
                                "dataset_name": "xxx",
                                ...
                            },
                            # All columns used, mapping the column names in each dataset in same names.
                            "column_mapping": {
                                "text1": "sequence1",
                                "text2": "sequence2",
                                "label": "label",
                            },
                            # float or str, float means to split the dataset into train/val,
                            # or just str(train/val)
                            "split": 0.8,
                        }
                    ]
                    dataset_json_file)__name__
__module____qualname__r   r$   r   __annotations__r%   r&   r'   r(   r)   r   r*   r+   r,    r   r   r   r      s         $e\
      "E]
  c    #UD
  s    !5F
  OS    u0 
  K   
 U. 
  Is   
 $)5)=
$ $ $S    "')?
" " "3    #U
  s     r   r   c                       e Zd ZU  edddd          Zeed<    edddi          Zeed<    eddd	i          Zeed
<    edddd          Z	eed<   dS )	ModelArgsNzThe task code to be usedtaskr    cfg_noder!   r    zA model id or model dirmodelzthe revision of modelmodel_revisionzJThe mode type, if load_model_config is False, user need to fill this fieldz
model.type
model_type)
r-   r.   r/   r   r4   r   r0   r7   r8   r9   r1   r   r   r3   r3   j   s         .
 
  D#    - 
  E3   
  %+ 
  NC   
 e Y$
 
  J     r   r3   c                      e Zd ZU  edddi          Zeed<    edddd	          Zeed
<    edddd	          Zeed<    edddd	          Z	e
ed<    edddd	          Ze
ed<    edddd	          Zeed<    edddd	          Zeed<    edddd	          Ze
ed<    eddd d	          Ze
ed!<    ed"d#d$d	          Zeed%<    ed&d'd(d	          Zeed)<    ed*d+d,d	          Zeed-<    ed.d/d0d	          Zeed1<    ed2d3d4d	          Zeed5<    ed6d7d8ed9          Zeed:<    ed6d;d<ed9          Zeed=<    ed>d?d@g dAdB          ZeedC<    edddDi          ZeedE<    ed"dFdGdH          ZeedI<    ed>dJdKg dAdL          ZeedM<    edNdOdPdH          ZeedQ<    ed6dRdSdH          ZeedT<    ed>dUdVg dAdL          Z eedW<    edNdXdYdH          Z!eedZ<    edd[d\dH          Z"e
ed]<    ed6d^d_dH          Z#eed`<    edadbdcdH          Z$eedd<    ed6dedfdH          Z%eedg<    edNdhdidH          Z&eedj<    eddkdldH          Z'e
edm<    ed6dndodH          Z(eedp<    ed6dqdrdH          Z)eeds<    edtdudvdH          Z*e
edw<    edxdydzdH          Z+eed{<    eddkd|dH          Z,e
ed}<    ed6dnd~dH          Z-eed<    ed6dqddH          Z.eed<    edtduddH          Z/e
ed<    edxdyddH          Z0eed<   d6S )	TrainArgs*   r    zThe random seedr!   seed   z#train.dataloader.batch_size_per_gpuz:The `batch_size_per_gpu` argument for the train dataloader)r6   r    per_device_train_batch_sizer   z train.dataloader.workers_per_gpuz7The `workers_per_gpu` argument for the train dataloadertrain_data_workerFztrain.dataloader.shufflez/The `shuffle` argument for the train dataloadertrain_shuffleztrain.dataloader.drop_lastz1The `drop_last` argument for the train dataloadertrain_drop_lastz(evaluation.dataloader.batch_size_per_gpuz9The `batch_size_per_gpu` argument for the eval dataloaderper_device_eval_batch_sizez%evaluation.dataloader.workers_per_gpuz6The `workers_per_gpu` argument for the eval dataloadereval_data_workerzevaluation.dataloader.shufflez.The `shuffle` argument for the eval dataloadereval_shufflezevaluation.dataloader.drop_lastz0The `drop_last` argument for the eval dataloadereval_drop_last   ztrain.max_epochszThe training epochs
max_epochsz./train_targetztrain.work_dirz%The directory to save models and logswork_dirg-C6
?ztrain.optimizer.lrz"The learning rate of the optimizerlrLinearLRztrain.lr_scheduler.typezThe lr_scheduler type in torchlr_schedulerAdamWztrain.optimizer.typez+The optimizer type in PyTorch, like `AdamW`	optimizerNztrain.optimizerzThe optimizer params)r6   r    
cfg_setteroptimizer_paramsztrain.lr_schedulerzThe lr scheduler paramslr_scheduler_paramsby_epochz&train.lr_scheduler.options.lr_strategyzThe lr decay strategy)rR   by_stepno)r6   r    choiceslr_strategyzThe local rank
local_rankz+The interval of iter of logging informationztrain.logging.intervalr5   logging_intervalz5Eval strategy, can be `by_epoch` or `by_step` or `no`zevaluation.period.eval_strategy)r    r6   rU   eval_strategy   zEval intervalzevaluation.period.intervaleval_intervalzThe metric name for evaluationzevaluation.metricseval_metricsz>Checkpointing strategy, can be `by_epoch` or `by_step` or `no`z%train.checkpoint.period.save_strategysave_strategyz9The interval of epoch or iter of saving checkpoint periodz train.checkpoint.period.intervalsave_intervalz;Save the checkpoint(if it's the best) after the evaluation.ztrain.checkpoint.best.save_bestsave_best_checkpointz%The metric used to measure the model.z train.checkpoint.best.metric_keymetric_for_best_modelmaxzDThe rule to measure the model with the metric, can be `max` or `min`ztrain.checkpoint.best.rulemetric_rule_for_best_modelzBThe max number of checkpoints to keep, older ones will be deleted.z*train.checkpoint.period.max_checkpoint_nummax_checkpoint_numzGThe max number of best checkpoints to keep, worse ones will be deleted.z(train.checkpoint.best.max_checkpoint_nummax_checkpoint_num_bestz$Push to hub after each checkpointingz#train.checkpoint.period.push_to_hubpush_to_hubz<The repo id in modelhub, usually the format is "group/model"z#train.checkpoint.period.hub_repo_idrepo_idzYThe modelhub token, you can also set the token to the env variable `MODELSCOPE_API_TOKEN`z!train.checkpoint.period.hub_token	hub_tokenTzUpload to a private hubz#train.checkpoint.period.private_hubprivate_hubmasterzWhich branch to commit toz$train.checkpoint.period.hub_revisionhub_revisionz!train.checkpoint.best.push_to_hubpush_to_hub_bestz!train.checkpoint.best.hub_repo_idrepo_id_bestztrain.checkpoint.best.hub_tokenhub_token_bestz!train.checkpoint.best.private_hubprivate_hub_bestz"train.checkpoint.best.hub_revisionhub_revision_best)1r-   r.   r/   r   r=   intr0   r?   r@   rA   boolrB   rC   rD   rE   rF   rH   rI   r   rJ   floatrL   rN   r   rP   rQ   rV   rW   rX   rY   r[   r\   r]   r^   r_   r`   rb   rc   rd   re   rf   rg   rh   rj   rk   rl   rm   rn   ro   r1   r   r   r;   r;      s~	         %
  D#   
 (-u=H
 
( ( (    #U:M
 
  s     %2E
 
  M4    "E4G
 
  OT    ',eBG
 
' ' '    "E?L
 
  c    7D
 
  L$    !59F
 
  ND    e*)
 
  J    E (;
 
  Hc    ,8
 
  B    14
 
  L#    U.A
 
  Is    "E)*+
 
  c     %u,-+
 
         u@+444
 
  K    e$
  J   
 "EA0
 
  c    K9444
 
  M3    #4
 
  M3    4,
 
  L#     M?444	
 
  M3     H:
 
  M3    "' K9
 
" " "$    "';:
 
" " "3    ',e S4
 
' ' '    $e QD
 
      $)5 VB
 
$ $ $S    :=
 
  K    5 K=
 
  GS    U h;
 
  Is    -=
 
  K    />
 
  L#    #U:;
 
  d     K;
 
  L#     % h9
 
  NC    #U-;
 
  d    #U/<
 
  s     r   r;   F)initc                   T    e Zd ZU  edddi          Zeed<   d ZddZdd	Z	d
 Z
dS )TrainingArgsFr    zeUse the configuration of the model, default will only use the parameters in the CLI and the dataclassr!   use_model_configc                     t          |                                          | _        t          |           D ],}|j        |v r!t          | |j        ||j                            -i | _        d S N)listkeysmanual_argsr   namesetattr_unknown_args)selfkwargsfs      r   __init__zTrainingArgs.__init__  se    .. 	6 	6AvaffQVn555r   Nc                 ,   t          |           }|                    |          \  }}d |D             }i }t          dt          |          d          D ]7}t	          ||dz                      |||                             dd          <   8t          |          }| xj        |j        z  c_        | j        	                    |           t          |                                          D ](\  }}	|!t          | |          rt          | ||	           )| S )zcConstruct a TrainingArg class by the parameters of CLI.

        Returns:
            Self
        c                 "    g | ]}|d vd|v
|S ))\
z--local-rank=r1   ).0items     r   
<listcomp>z*TrainingArgs.parse_cli.<locals>.<listcomp>  s7     
 
 
<''O4,G,G ,G,G,Gr   r      rZ   - )r	   parse_known_argsranger   r   replacevarsr{   r~   updater   itemshasattrr}   )
r   parser_argsparserargsunknown_unknowni	args_dictr   r   s
             r   	parse_clizTrainingArgs.parse_cli  s,    #4((//<<g
 
$
 
 
 q#g,,** 	P 	PA4?A4O4OHWQZ''R0011JJ	F..!!(+++"9--3355 	* 	*JC74#5#5c5)))r   c                 $   t                      }t          j                    }|| j        }t	          |           D ]}|j                            d          }|j                            d          pd }|a|j        | j        v s|sPt          |t                    r|g}|D ]5}|                    | |t          | |j                            i           6t          | |j                  ||j        <   |                    | j                   ||fS )zyConvert the TrainingArgs to the `Config`

        Returns:
            The Config, and extra parameters in dict.
        Nr6   rO   c                     | S rx   r1   xs    r   <lambda>z(TrainingArgs.to_config.<locals>.<lambda>  s    A r   )r
   addictDictrv   r   r#   getr|   r{   r   r   merge_from_dictgetattrr~   )r   ignore_default_configcfgr   r   r6   rO   _nodes           r   	to_configzTrainingArgs.to_config  s-    hhKMM	 ($($9! 	: 	:Az~~j11H55F++J#6T---5J-!(C00 .$,:!) H H++"JJwtQV/D/D$E$EFH H H H %,D!&$9$9	!&!!D.///I~r   c                 L    t          |           }|D ]}|j        |k    r|c S d S rx   )r   r|   )r   r   _fieldsr   s       r   get_metadatazTrainingArgs.get_metadata  s:    ,, 	 	Av}} tr   rx   )r-   r.   r/   r   rv   rq   r0   r   r   r   r   r1   r   r   ru   ru     s          #UP
  d           .   8    r   ru   c                    ddl m} g }g }t          | d          5 }t          j        |          }|D ]} |j        di |d                                         }|j        }|d                                         fd|D             }	ddlm	}
 ddlm
} dd	lm} fd
|j                                        D             }i }|D ]I}t          |d         |          r  ||d         j                  ||d         <   8|d         ||d         <   J |
|          }|                    d |	|                              |d                   }|d         }t          |t$                    r:|dv sJ |dk    r|                    |           Y|                    |           pt          |t(                    rd|cxk     rdk     sn J |                    |          }|                    |d                    |                    |d                    	 ddd           n# 1 swxY w Y   ddlm}  ||           ||          fS )aW  
    The filename format:
    [
        {
            "dataset": {
                "dataset_name": "xxx",
                ...
            },
            "column_mapping": {
                "text1": "sequence1",
                "text2": "sequence2",
                "label": "label",
            }
            "usage": 0.8,
        }
    ]
    r   )	MsDatasetrdatasetcolumn_mappingc                     g | ]}|v|	S r1   r1   )r   columnkeep_columnss     r   r   z+build_dataset_from_file.<locals>.<listcomp>  s*       !F,4N4N4N4N4Nr   )Features)Value)
ClassLabelc                 (    g | ]}|d          v |S )r   r1   )r   r   r   s     r   r   z+build_dataset_from_file.<locals>.<listcomp>  s.       qt|7K7K7K7K7Kr   rZ   c                     | S rx   r1   r   s    r   r   z)build_dataset_from_file.<locals>.<lambda>"  s    ! r   )remove_columnsfeaturesusage)trainvalr   )
train_sizetestN)concatenate_datasetsr1   )
modelscoper   openjsonloadto_hf_datasetcolumn_namesrz   datasetsr   r   r   r   r   r   dtypemaprename_columnsr   appendrr   train_test_splitr   )filenamer   	train_seteval_setr   ds_jsondsr   all_columnsr   r   r   r   r   new_featuresr   ds_dictr   r   s                     @r   build_dataset_from_filer     s>   $ %$$$$$IH	h		 %1)A,, #	1 #	1B$in55r)}55CCEEG!.K./4466L   %0  N *)))))&&&&&&++++++   "+1133  H L . .adJ// .).qtz):):L1&&)*1L1&&#8L11Lkk-% " ' ' (6~b9I6J'K'K  wKE%%% 
1 00000G##$$W----OOG,,,,!%//AAMMMMMMMMMA!22e2DD  !12220000G#	1%1 %1 %1 %1 %1 %1 %1 %1 %1 %1 %1 %1 %1 %1 %1N .-----	**,@,@,J,JJJs   G;H$$H(+H(r   returnc                 &   ddddd d d d}| |v r||          S d| v sd| v r*|                      dd                               dd          S t          j        d|           rt          |           S t          j        d|           rt	          |           S | S )	NTF)TruetrueFalsefalseNonenonenull"'r   z^\d+$z4[+-]?(?=\d*[.eE])(?=\.?\d)\d*\.?\d*(?:[eE][+-]?\d+)?)r   rematchrp   rr   )r   	const_maps     r   r   r   6  s     I 		}}S"%%--c2666	(E	"	" 5zz	I
 
 U||r   )r   copyr   dataclassesr   r   r   typingr   r   r   r   'modelscope.trainers.cli_argument_parserr	   modelscope.utils.configr
   modelscope.utils.constantr   r   r   r   r3   r;   ru   r   rr   rq   r   r1   r   r   <module>r      s   				       0 0 0 0 0 0 0 0 0 0           E E E E E E * * * * * * ? ? ? ? ? ?eCcN3     L L L L L L L L^        6 ^ ^ ^ ^ ^ ^ ^ ^B	 I I I I I;	9 I I IX>K >K >KBs uS%t%;<      r   