o
    dix                     @   s2  d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZ d d	lmZmZmZ d d
lmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z& ddl'm(Z( e  rwd dl)Z)e re(ddrd dl*m+Z+ G dd deZ,eG dd deZ-dS )    N)	dataclassfield)Path)Optional)version)TrainingArgumentsDebugOption)EvaluationStrategy
FSDPOptionHubStrategyIntervalStrategySchedulerType)OptimizerNamesdefault_logdirlogger)	ExplicitEnumget_full_repo_nameis_accelerate_availableis_safetensors_availableis_torch_availableis_torch_bf16_cpu_availableis_torch_bf16_gpu_availableis_torch_tf32_availablelogging)	strtobool   )is_transformers_version>=4.38.0)AcceleratorConfigc                   @   s   e Zd ZdZdZdS )ORTOptimizerNameszb
    Stores the acceptable string identifiers for optimizers in `onnxruntime.training.optim`.
    adamw_ort_fusedN)__name__
__module____qualname____doc__ADAMW_ORT_FUSED r(   r(   o/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/optimum/onnxruntime/training_args.pyr!   9   s    r!   c                   @   s   e Zd ZU dZedddidZee ed< edddidZ	ee
 ed	< eddd
idZee
 ed< edddidZee ed< edddidZee ed< dd ZdS )ORTTrainingArgumentsaf  
    Parameters:
        optim (`str` or [`training_args.ORTOptimizerNames`] or [`transformers.training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`):
            The optimizer to use, including optimizers in Transformers: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor. And optimizers implemented by ONNX Runtime: adamw_ort_fused.
    adamw_hfhelpzThe optimizer to use.)defaultmetadataoptimFzzUse ModuleWithLoss Wrapper to compute loss inside the training loop, having this will help save memory for ORTModule Runs.use_module_with_lossa-  Configure ORTModule to save onnx models. Defaults to False.             The output directory of the onnx models by default is set to args.output_dir.             To change the output directory, the environment variable ORTMODULE_SAVE_ONNX_PATH can be             set to the destination directory path.	save_onnxNzQPrefix for the saved ORTModule file names. Must be provided if save_onnx is True.onnx_prefixWARNINGzConfigure ORTModule log level. Defaults to WARNING.             onnx_log_level can also be set to one of VERBOSE, INFO, WARNING, ERROR, FATAL.onnx_log_levelc                 C   s  | j d urtj| j | _ | jd u r!| j d ur!tj| j t | _| jd ur.tj| j| _| jd u r;t	 t
jk| _t| jtrLtdt | jj| _t| j| _t| j| _t| j| _t| j| _t| j| _| jdu rx| jtjkrxd| _| jtjkr| jd u s| jdkr| jdkrtd| j  | j| _n	t d| j d| jtjkr| jdkrt d| j d	| jtjkr| jd
kr| jt!| jkrt d| j t!| j| _| jtjkr| jd
kr| jt!| jkrt d| j t!| j| _| jtjkr| j"d
kr| j"t!| j"krt d| j" t!| j"| _"| j#r| j| jkr6t d| j" d| j d| jtjkr| j"| j dkr| jd
k sR| j"d
k r| jd
k r^| j"d
k skt d| j" d| j dd}| j"| | j|  dkrt d| j" d| j dt d| j" d| j dt$ }| j%r|st d| j% d| j%s|rtd| j% d | j#s| jtj&kr| j'd u rd| _'| j(d u r| j'd ur| j'dv| _(| j)d u r| j | _)| j*dkr,t+ r,| j,r| j,dkrtdt | j,| _-| j.s| j/r,| j0rt1 st d | j0s,t2j34 r,t5 s,t d!| j6r8| j.r8t d"| j7rD| j/rDt d#| j.rR| j-d$krRt d%| jtj&krl| jtjkrdt d&t+ slt d'zt8| j9| _9W n t y   t:| j9| _9Y nw | j;rtd(t t:j<| _9| j9t:j=krt+ rt>?t>?t2j@jAt>?d)k rt d*t>?t>?t2j@jAt>?d)kr| j6rt d+| jBr| jCst d,tDd-d s| j tjEd-< | jFtjEd.< t+ r| jGjHd/kr| jGjHd0krd1tjEv s| j6s	| j7rt d2t+ r8| jGjHd/kr8| jGjHd0kr%d1tjEv s8| jGjHd3kr8| j.s4| j/r8t d4| jId urHtd5t | jI| _J| jKd usT| jJd ur[| jLs[d| _L| jLrh| jJd u rhd6| _J| jLrd7}| jJtjE|d8 < | jKd ur| jKtjE|d9 < | j*dkrt+ r| jLrtM r| jNd u r| j6r| j.rtd: dt2jOj3jP_Qdt2jOjR_QntSd; t+ r| jNd ur| jNrtM rdt2jOj3jP_Qdt2jOjR_Qnt d<tM rdt2jOj3jP_Qdt2jOjR_Q| j-d$kr
tjETd=d>}| j6rd?}n| j.rd@}|tjEd=< | jUd u rtdA dB| _U| jUdBks%| jUdBgkr0ddClVmW} | | _Un| jUdDks=| jUdDgkrAg | _Unt| jUtXsM| jUg| _U| jYdk sY| jYd
kr]t dE| jYdkrn| jZdkrntdF t| j[t\r~| j[r{dGndH| _[t| j[t]rdIdJ | j[^ D | _[| j[t_j`gkrt dKt_ja| j[v rt_jb| j[v rt dL| jcd u ri | _ct| jct]rtd| j[dkrtdM tejf| jcdNdOdP.}tgh|| _ctX| jci D ]}|jdQr| jck|}|| jc|dRd  < qW d    n	1 sw   Y  | jldkrtdSt tm| jcTdTd| jl| jcdT< t| jcTdUd t]r9| jcdU g| jcdU< | jnd urStdVt | jcTdUg | jng | jcdU< td| j[dkrh| jcdT dkrhtdW td| j[dkr| jcTdUd d urtdX td| j[dkr| jcdT dkr| jcTdUd d urt dY| jcTd0d| jcd0< | jcTdZd| jcdZ< | jcTd[d| jcd[< | jcd0 rtd| j[dkr| jcTd\i | _od]| jov rtpt2| jod] | jod]< d^| jov rtpt2| jod^ | jod^< ntd_ n| jcd[ rtd` td| j[dkr| jcd0 sdatjEdb< ddclqmr}	ms}
 dd}| j[D ]v}|t |
v rAt]|
u|t d
 tjE| de< q%|t_j`krQdatjE| df< q%|t_jvkr|	d tjE| dg< | jcdT dkrt]| jcdT tjE| dh< |	d
 tjE| dg< q%| jcTdUd d urdi| jcdU tjE| dj< q%| jcTdkdl}|t tjE| dm< | jcTdndotjE| dp< | jcTdqdatjE| dr< | jcTdsdotjE| dt< tw r*txdudvr*t| jytzs| jyd u rtz | _ynt| jyt{rtzdi | jy| _yntz|| jy| _y| j}d urtdwt | j}| jy_}| j~d ur*tdxt | j~| jy_~| jrHtdyt | jd u r>dz| _n|  jdz7  _d| _t| jt]r[d{dJ | j^ D | _n	| jd u rdg | _d | _| jrtw sst d|dd}lm} || j| _| j|  dd~lm} datjEd< || jd| _n&ttjETddordd~lm} | | _tjETd=d>}| j| | j  | jd urtdt | j| _| jd urt| j| j| jd| _| jd urtd| j dt n*td| j dt n| jd ur| j dt| j j | _td| j dt | j-d$kr:tjETd=d>}| j6r/d?}n| j.r5d@}|tjEd=< | jdu rGtd d S td d S )Nu   using `EvaluationStrategy` for `eval_strategy` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `IntervalStrategy` insteadFTr   z4using `logging_steps` to initialize `eval_steps` to zevaluation strategy z9 requires either non-zero --eval_steps or --logging_stepszlogging strategy z" requires non-zero --logging_steps   z5--logging_steps must be an integer if bigger than 1: z2--eval_steps must be an integer if bigger than 1: z2--save_steps must be an integer if bigger than 1: z--load_best_model_at_end requires the saving steps to be a multiple of the evaluation steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps z and eval_steps .z--load_best_model_at_end requires the saving steps to be a multiple of the evaluation steps, which cannot get guaranteed when mixing ratio and absolute steps for save_stepsi@B zg--load_best_model_at_end requires the saving steps to be a multiple of the evaluation steps, but found z, which is not a multiple of zm--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation steps, but found z#, which is not a round multiple of z--save_safetensors=z& requires safetensors to be installed!z7Found safetensors installation, but --save_safetensors=z. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!loss)r7   	eval_lossptautoux   `fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `half_precision_backend` insteadzLYour setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10z[Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0z6At most one of fp16 and bf16 can be True, but not bothzDAt most one of fp16 and bf16 can be True for full eval, but not bothapexzt `--half_precision_backend apex`: GPU bf16 is not supported by apex. Use `--half_precision_backend cuda_amp` insteadz@lr_scheduler_type reduce_lr_on_plateau requires an eval strategyz<lr_scheduler_type reduce_lr_on_plateau requires torch>=0.2.0ur   `--adafactor` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--optim adafactor` insteadz2.0.0z8--optim adamw_torch_fused requires PyTorch 2.0 or higherz:--optim adamw_torch_fused with --fp16 requires PyTorch>2.0z1onnx_prefix must be provided if save_onnx is TrueORTMODULE_SAVE_ONNX_PATHORTMODULE_LOG_LEVELcudaxlaGPU_NUM_DEVICESzFP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA devices.cpuzBF16 Mixed precision training with AMP (`--bf16`) and BF16 half precision evaluation (`--bf16_full_eval`) can only be used on CUDA or CPU/TPU/NeuronCore devices.uv   `torchdynamo` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `torch_compile_backend` insteadinductorACCELERATE_DYNAMO_BACKENDMODEz`Setting TF32 in CUDA backends to speedup torch compile, you won't see any improvement otherwise.zaThe speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here.zC--tf32 requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7ACCELERATE_MIXED_PRECISIONnofp16bf16a  The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).all)$get_available_reporting_integrationsnonez$warmup_ratio must lie in range [0,1]zoBoth warmup_ratio and warmup_steps given, warmup_steps will override any effect of warmup_ratio during training
full_shard c                 S      g | ]}t |qS r(   )r   .0sr(   r(   r)   
<listcomp>      z6ORTTrainingArguments.__post_init__.<locals>.<listcomp>z`--fsdp offload` can't work on its own. It needs to be added to `--fsdp full_shard` or `--fsdp shard_grad_op`. For example, `--fsdp "full_shard offload"`.zB`--fsdp full_shard` is not compatible with `--fsdp shard_grad_op`.z:`--fsdp_config` is useful only when `--fsdp` is specified.rzutf-8)encodingfsdp_   zEusing `--fsdp_min_num_params` is deprecated. Use fsdp_config instead min_num_paramstransformer_layer_cls_to_wrapzTusing `--fsdp_transformer_layer_cls_to_wrap` is deprecated. Use fsdp_config instead z;`min_num_params` is useful only when `--fsdp` is specified.zJ`transformer_layer_cls_to_wrap` is useful only when `--fsdp` is specified.zL`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.xla_fsdp_v2xla_fsdp_grad_ckptxla_fsdp_settingscompute_dtypebuffer_dtypez5XLA FSDP can be used only when `--fsdp` is specified.zB`--xla_fsdp_grad_ckpt` is useful only when `--xla` is set to true.trueACCELERATE_USE_FSDP)FSDP_AUTO_WRAP_POLICYFSDP_SHARDING_STRATEGYFSDP_SHARDING_STRATEGYOFFLOAD_PARAMSAUTO_WRAP_POLICYMIN_NUM_PARAMS,TRANSFORMER_CLS_TO_WRAPfsdp_backward_prefetchNO_PREFETCHBACKWARD_PREFETCHforward_prefectfalseFORWARD_PREFETCHsync_module_statesSYNC_MODULE_STATESuse_orig_paramsUSE_ORIG_PARAMSr   r   u   Using `--dispatch_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use `--accelerator_config {'dispatch_batches':VALUE} insteadu   Using `--split_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use `--accelerator_config {'split_batches':VALUE} insteadu   using `--tpu_metrics_debug` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--debug tpu_metrics_debug` insteadz tpu_metrics_debugc                 S   rO   r(   r   rP   r(   r(   r)   rS   	  rT   zJ--deepspeed requires Accelerate to be installed: `pip install accelerate`.)HfTrainerDeepSpeedConfig)DeepSpeedPluginACCELERATE_USE_DEEPSPEED)hf_ds_configuu   `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.)organizationtokenu   `--push_to_hub_model_id` and `--push_to_hub_organization` are deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_model_id` instead and pass the full repo name to this argument (in this case z).u   `--push_to_hub_model_id` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_model_id` instead and pass the full repo name to this argument (in this case /u   `--push_to_hub_organization` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_model_id` instead and pass the full repo name to this argument (in this case zeUsing ModuleWithLoss Wrapper.loss will be computed during training loop and it will save memory peak z!Not Using ModuleWithLoss Wrapper.r(   )
output_dirospath
expanduserlogging_dirjoinr   disable_tqdmr   getEffectiveLevelr   WARN
isinstanceeval_strategyr
   warningswarnFutureWarningvaluer   logging_strategysave_strategyr   hub_strategyr   lr_scheduler_typedo_evalNOSTEPS
eval_stepslogging_stepsinfo
ValueErrorint
save_stepsload_best_model_at_endr   save_safetensorsREDUCE_ON_PLATEAUmetric_for_best_modelgreater_is_betterrun_name	frameworkr   fp16_backendhalf_precision_backendrI   bf16_full_evaluse_cpur   torchr>   is_availabler   rH   fp16_full_evalr!   r/   r   	adafactor	ADAFACTORADAMW_TORCH_FUSEDr   parse__version__base_versionr1   r2   getenvenvironr4   devicetypetorchdynamotorch_compile_backendtorch_compile_modetorch_compiler   tf32backendsmatmul
allow_tf32cudnnwarningget	report_totransformers.integrationsrK   listwarmup_ratiowarmup_stepsfsdpboolstrsplitr   OFFLOAD
FULL_SHARDSHARD_GRAD_OPfsdp_configlenioopenjsonloadkeys
startswithpopfsdp_min_num_paramsmax"fsdp_transformer_layer_cls_to_wrapxla_fsdp_configgetattraccelerate.utils.constantsrb   rc   upperindex	AUTO_WRAPr   r   accelerator_configr    dictfrom_json_filedispatch_batchessplit_batchestpu_metrics_debugdebugdeepspeed_plugin	deepspeed#transformers.integrations.deepspeedru   hf_deepspeed_configtrainer_config_processaccelerate.utilsrv   r   set_mixed_precisionset_deepspeed_weakrefpush_to_hub_token	hub_tokenpush_to_hub_model_idr   push_to_hub_organizationhub_model_idr   namer0   )selfLARGE_MULTIPLIERsafetensors_availableprefixmixed_precision_dtyperK   fkvrb   rc   fsdp_optionprefetch_policyru   rv   mixed_precisionr(   r(   r)   __post_init__j   s*  



 

  ( 






 
$









z"ORTTrainingArguments.__post_init__)r#   r$   r%   r&   r   r/   r   r   __annotations__r0   r   r1   r2   r4   r   r(   r(   r(   r)   r*   A   s.   
 
	r*   ).r   r   r}   r   dataclassesr   r   pathlibr   typingr   	packagingr   transformersr   transformers.debug_utilsr	   transformers.trainer_utilsr
   r   r   r   r   transformers.training_argsr   r   r   transformers.utilsr   r   r   r   r   r   r   r   r   transformers.utils.genericr   utils.import_utilsr   r   transformers.trainer_pt_utilsr    r!   r*   r(   r(   r(   r)   <module>   s,   ,