o
    di                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
mZmZmZmZmZmZmZ ddlZddlmZ ddlmZ ddlmZ e ruddlmZ ddlmZ eeed	krldd
lmZ ndZddlm Z  ne!dddl"m#Z$ ddl%Z%ddl&m'Z( ddl%m)Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z;m<Z< ddl=m>Z>m?Z?m@Z@ ddlAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJ ddlKmLZL ddlmMZMmNZNmOZOmPZPmQZQ ddl#mRZR ddlSmTZT ddlUmVZVmWZW ddl#mXZX eO rddlYmZZZ eTd d!r,dd"l[m\Z\m]Z]m^Z^ n
dd"l_m\Z\m]Z]m^Z^ eTd d#rPdd$lm`Za ea rOddlbmc  mdZe ndd%lmfZa ead&d'reddlbmc  mdZe e
rlddlgZgd(Zhd)ZieRjekZlG d*d+ d+e3ZmG d,d- d-e8ZndS ).ux   
The ORTTrainer class, to easily train a 🤗 Transformers from scratch or finetune it on a new task with ONNX Runtime.
    N)TYPE_CHECKINGAnyCallableDictListOptionalTupleUnion)	hp_params)is_accelerate_available)version)__version__)DistributedDataParallelKwargsz0.16)skip_first_batches)DistributedTypezThe package `accelerate` is required to use the ORTTrainer. Please install it following https://huggingface.co/docs/accelerate/basic_tutorials/install.)nn)DatasetRandomSampler)DataCollator)DebugOptionDebugUnderflowOverflow)PreTrainedModelunwrap_model)PreTrainedTokenizerBase)Trainer)ExportableStateTrainerCallbackTrainerState)get_model_param_countget_module_class_from_nameget_parameter_names)	EvalPredictionHPSearchBackendTrainOutputenable_full_determinismfind_executable_batch_sizeget_last_checkpoint
has_lengthset_seedspeed_metrics)ParallelMode)SAFE_WEIGHTS_NAMEWEIGHTS_NAMEis_apex_availableis_sagemaker_dp_enabledis_sagemaker_mp_enabled   )logging)is_transformers_version   )ORTOptimizerNamesORTTrainingArguments)!is_onnxruntime_training_available)amp>=z4.33)deepspeed_initdeepspeed_load_checkpointis_deepspeed_zero3_enabledz4.39)is_torch_xla_available)is_torch_tpu_availableF)check_deviceztrainer_state.jsonztraining_args.binc                       sT   e Zd Z fddZdeeeeje	f f fddZ
edd Zedd	 Z  ZS )
ModuleWithLossc                    s    t    || _|| _|| _d S N)super__init___original_modelargslabel_smoother)selfmodelrD   rE   	__class__ i/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/optimum/onnxruntime/trainer.pyrB      s   

zModuleWithLoss.__init__inputsc                 C   s   |  | j|||S r@   ) compute_model_plus_loss_internalrC   )rF   rL   return_outputsnum_items_in_batchrJ   rJ   rK   forward   s   zModuleWithLoss.forwardc                 C      | j jS )zThe original `torch.nn.Module` that this module wraps.
        This property provides access to methods and properties on the original module.)rC   modulerF   rJ   rJ   rK   rR      s   zModuleWithLoss.modulec                 C   rQ   r@   )rC   configrS   rJ   rJ   rK   rT      s   zModuleWithLoss.config)__name__
__module____qualname__rB   r   strr	   torchTensorr   rP   propertyrR   rT   __classcell__rJ   rJ   rH   rK   r?      s     
r?   c                       s  e Zd ZdZ											d+deeejf dede	e
 de	e de	eeeeef f  d	e	e d
e	eg ef  de	eegef  de	ee  deejjejjjf de	eejejgejf  f fddZdd Zdd Zd, fdd	Z			d-de	eeef  dedeeef f de	ee  fddZ 	d.ddZ!d/d!d"Z"d#d$ Z#e$ded%eeef fd&d'Z%dd(e	e fd)d*Z&  Z'S )0
ORTTraineru0  
    ORTTrainer is a simple but feature-complete training and eval loop for ONNX Runtime, optimized for 🤗 Transformers.

    Args:
        model ([`~transformers.PreTrainedModel`] or `torch.nn.Module`, *optional*):
            The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.

            <Tip>

            [`ORTTrainer`] is optimized to work with the [`~transformers.PreTrainedModel`] provided by the transformers library.
            You can still use your own models defined as `torch.nn.Module` for training with ONNX Runtime backend
            and inference with PyTorch backend as long as they work the same way as the 🤗 Transformers models.

            </Tip>

        args ([`ORTTrainingArguments`], *optional*):
            The arguments to tweak for training. Will default to a basic instance of [`ORTTrainingArguments`] with the
            `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
        data_collator ([`~transformers.DataCollator`], *optional*):
            The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
            default to [`~transformers.default_data_collator`] if no `tokenizer` is provided, an instance of
            [`~transformers.DataCollatorWithPadding`] otherwise.
        train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*):
            The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
            `model.forward()` method are automatically removed.
            Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
            distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a
            `torch.Generator` for the randomization that must be identical on all processes (and the ORTTrainer will
            manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
            sets the seed of the RNGs used.
        eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`]), *optional*):
            The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
            `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
            dataset prepending the dictionary key to the metric name.
        tokenizer ([`~transformers.PreTrainedTokenizerBase`], *optional*):
            The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs the
            maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
            interrupted training or reuse the fine-tuned model.
        model_init (`Callable[[], PreTrainedModel]`, *optional*):
            A function that instantiates the model to be used. If provided, each call to [`ORTTrainer.train`] will start
            from a new instance of the model as given by this function.
            The function may have zero argument, or a single one containing the optuna/Ray Tune/SigOpt trial object, to
            be able to choose different architectures according to hyper parameters (such as layer count, sizes of
            inner layers, dropout probabilities etc).
        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
            The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
            a dictionary string to metric values.
        callbacks (List of [`TrainerCallback`], *optional*):
            A list of callbacks to customize the training loop. Will add those to the list of default callbacks
            detailed in [here](callback).
            If you want to remove one of the default callbacks used, use the [`ORTTrainer.remove_callback`] method.
        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*): A tuple
            containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model
            and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
            A function that preprocess the logits right before caching them at each evaluation step. Must take two
            tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
            by this function will be reflected in the predictions received by `compute_metrics`.
            Note that the labels (second parameter) will be `None` if the dataset does not have them.
    Important attributes:
        - **model** -- Always points to the core model. If using a transformers model, it will be a [`~transformers.PreTrainedModel`]
        subclass.
        - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
        original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
        the inner model is first wrapped in `ORTModule` and then in `DeepSpeed` and then again in
        `torch.nn.DistributedDataParallel`. If the inner model hasn't been wrapped, then `self.model_wrapped` is the
        same as `self.model`.
        - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
        data parallelism, this means some of the model layers are split on different GPUs).
        - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set
        to `False` if model parallel or deepspeed is used, or if the default
        `ORTTrainingArguments.place_model_on_device` is overridden to return `False` .
        - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
        in `train`)
    NNNrG   rD   data_collatortrain_dataseteval_dataset	tokenizer
model_initcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricsc                    sb   t d t j|||||||||	|
|d |jr|  | _|| _| jj	r/t
j| jj	 d S d S )NzAThe ORTTrainer is deprecated and will be removed in Optimum v2.0.)rG   rD   r_   r`   ra   rb   rc   rd   re   rf   rg   )loggerwarningrA   rB   use_module_with_losscreate_model_with_loss_training_modelrG   rD   
local_rankrY   cuda
set_device)rF   rG   rD   r_   r`   ra   rb   rc   rd   re   rf   rg   rH   rJ   rK   rB      s(   

zORTTrainer.__init__c                 C   s&   t | j| j| j}ttj||_|S r@   )	r?   rG   rD   rE   types
MethodTyper   compute_lossrM   )rF   model_with_lossrJ   rJ   rK   rk     s   z!ORTTrainer.create_model_with_lossc                 C   sv   | j d u r9dd l}t| jtr|| jjj}n|| jj}t|j	
 | _ |  j ttddg| j 7  _ d S d S )Nr   label	label_ids)_signature_columnsinspect
isinstancerG   r?   	signaturerC   rP   list
parameterskeyssetlabel_names)rF   rw   ry   rJ   rJ   rK    _set_signature_columns_if_needed  s   
$z+ORTTrainer._set_signature_columns_if_neededFc                    s6   t | jtrt| }||||S t ||||S r@   )rx   rG   r?   dictitemsrA   rr   )rF   rs   rL   rN   rO   dict_inputsrH   rJ   rK   rr   ,  s   zORTTrainer.compute_lossresume_from_checkpointtrialzoptuna.Trialignore_keys_for_evalc                 K   s  t  std| jjr| j| _|du rd}| j  | j}d| _|j	s&|j
r1|js1| | j|j d|v r@|d}tdt t|dkrUtdd	t|  d
| | | jj| _d}| jdur| jjrpt| jjnt| jj | || _d}d\| _ | _!t"|t#r|rt$|j%}|du rt&d|j% d|durt' s| j(s| j)s| *| |r| j+r| | j|j | j| _,t-| j.| j|j/}|j0rzt12  |||||dW t13  S t13  w |||||dS )a  
        Main entry point for training with ONNX Runtime accelerator.

        Args:
            resume_from_checkpoint (`str` or `bool`, *optional*):
                If a `str`, local path to a saved checkpoint as saved by a previous instance of [`ORTTrainer`]. If a
                `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
                of [`ORTTrainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
            trial (`optuna.Trial` or `Dict[str, Any]`, *optional*):
                The trial run or the hyperparameter dictionary for hyperparameter search.
            ignore_keys_for_eval (`List[str]`, *optional*)
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions for evaluation during the training.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments used to hide deprecated arguments
        zYou need to install `onnxruntime-training` to use `ORTTrainer` for training. Check out https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer#install-onnx-runtime.FNT
model_pathzi`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` instead.r   z3train() received got unexpected keyword arguments: z, .r^   z/No valid checkpoint found in output directory ())rD   r   r   r   )4r6   ImportErrorrD   rj   rl   rG   _memory_trackerstartis_in_trainfp16_full_evalbf16_full_evaldo_train_move_model_to_devicedevicepopwarningswarnFutureWarninglen	TypeErrorjoinrz   r|   _hp_search_setuptrain_batch_size_train_batch_sizerc   full_determinismr$   seedr(   call_model_init	optimizerlr_schedulerrx   boolr&   
output_dir
ValueErrorr/   is_deepspeed_enabledis_fsdp_enabled_load_from_checkpointplace_model_on_devicemodel_wrappedr%   _inner_training_loopauto_find_batch_sizepush_to_hubhf_hub_utilsdisable_progress_barsenable_progress_bars)rF   r   r   r   kwargsrD   model_reloadedinner_training_looprJ   rJ   rK   train5  s   






zORTTrainer.trainc           /         s@  ddl m} | j  || _td| j  |  }| j|j |j	 }d }	t
|rpt|}	|	|j }
t|
d}
| |}|jdkrY|j}|j|
 t|j|
 dk }|j| }n7t|j|
 }t|j}| ||j }n |jdkr|j}tj}|}
||j }|j| }ntd|j tj| jjv r| jjdkrtdt| j}t p| jp| j}td | jj rddl m!} || j|| jj | jj"d	}n|| j}|| _#|| _| j$rd | _%d
| _$| j&rt' rt(d|j)rt*+dt, || _t-| |d\| _.| _%|s| j/|d t0dd | j1j2| j3g D d| _4|d u| j4_5| j| j4_6|j7d urC|j7dk r>t||j7 | j4_7n|j7| j4_7|j8d ur_|j8dk rZt||j8 | j4_8n|j8| j4_8|j9d ur{|j9dk rvt||j9 | j4_9n|j9| j4_9|j:r| j;  | <| j#}t s| jr|d ur| =|| || ju rdnd
}|r|r| j>| j| _| j/|d |r| j?  | jj@tAjBk tC| j%dr| jDr| j>| j}n| j>| j| j.\}| _.n| j>| j| j.| j%\}| _.| _%tE|| _| jr|| _|| jur|| _#| j&r| j#| _F|d ur&| j&r&tG| j#| | H| td td|d td|d td| jjId | jjI| jkr_td| jd td|d td|j  td|d tdtJ|ddd d| j4_KtLL }d}d}d }|d urtMjNOtMjNP|tQrt0RtMjNP|tQ| _4| j4jS|
 }|jTs| j4jS|
 }||j9 }nd}td td|  td | j4jS  |jTstd!| d"| d# | j| j1_| j.| j1_.| j%| j1_%|| j1_U| jVd ur| jWd ur| V| jW| j4_X|d ur/| jYtZj[kr&|j\n|}t]|| j4_^nd | j4_^|| j4_|| j4_| _ | j4__| ` | j4_`tabd$c|jd}d$| _e| j4jS| _f|g  | j1h|| j4| j3| _3 fd%d& |jTsti|D ]#} |}tj|tk}|s|D ]} qu|d ur|ng }tl|}qud}ti||D ]F}|}|jmdkrd | _n|	d urt|n|j|j } | j1o|| j4| j3| _3||kr|d ur|dkr| p| d
}!d}"|dkrtq||}|}"d}d}!d'}#tr|D ]\}#}$|d7 }|!r	| p| d
}!|dkr(|d8 }|d ur|sd |dkr&| p| q|d ur3|t  d }|#|j dkrF| j1u|| j4| j3| _3| jv| | w||$}%W d    n	1 s^w   Y  |jxrty staz|%swta{|%r||d| j4jS | jf  7 }n||%7 }|  j|t}| ~|$7  _|| |jko|#d | k}&||j dks|&r{|&stttd(kr| jjd |jd ur|jdkrt r|jr| j.|j}'n&tC| j.d)r| j.|j}'ntC|d*r||j}'n
| j| |j}'t r| jj@tAjBkr| }(n|'d ur|' nd }(| j.  | jj })|)r;tj| j%tajj%js;| j%  |g  | j4 jSd7  _S||#d |" |   | j4_K| j1|| j4| j3| _3td+d,rp| ||(||||| n| ||(|||| n| j1|| j4| j3| _3| j3js| j3jr nq|#dk rtd-| j4jS d.| d/ d| j3_| j1|| j4| j3| _3td+d,r| ||(||||| n
| ||(|||| tj| jjv rtd0 | j3jr nq|jmrtC| d1rt| d1 td2 |jr| j4jd ur|jtjkrt  |   |  je| 7  _e| je| j4jS }*td3||| j4jd4}+|   | j4j|+d5< |*|+d6< d
| _| j|+ | |+ | |},| jd
|,d7}-| jjr| j4jd ur| jjdkr|-D ]}.tMjN|.| j4jstd8|. d9 t|. qm| j1|| j4| j3| _3|   t| j4jS|*|+S ):Nr   	ORTModulez)Currently training with a batch size of: r3   zYargs.max_steps must be set to a positive value if dataloader does not have a length, was zjCurrently --debug underflow_overflow is not supported under DP. Please use DDP (torch.distributed.launch).z)Wrap ORTModule for ONNX Runtime training.)DebugOptions)	save_onnxonnx_prefixFze`ORTTrainer` does not support ZeRO stage 3 for the moment. Please use DeepSpeed stage 1 or 2 instead.zONNX Runtime doesn't support BF16 when executing some operators. The execution will fail if there are any op which doesn't support BF16 in the IR.)num_training_stepsc                 S   s   g | ]	}t |tr|qS rJ   )rx   r   ).0cbrJ   rJ   rK   
<listcomp>  s
    
z3ORTTrainer._inner_training_loop.<locals>.<listcomp>)stateful_callbacksTstepz***** Running training *****z  Num examples = ,z  Num Epochs = z(  Instantaneous batch size per device = zA  Training with DataParallel so batch size has been adjusted to: zE  Total train batch size (w. parallel, distributed & accumulation) = z   Gradient Accumulation steps = z  Total optimization steps = z#  Number of trainable parameters = )trainable_onlyzE  Continuing training from checkpoint, will skip to saved global_stepz!  Continuing training from epoch z'  Continuing training from global step z  Will skip the first z epochs then the first z batches in the first epoch.        c                    s2   t | dr| jd ur | jS t | dr| jS d S )Nbatch_samplersampler)hasattrr   r   )
dataloaderget_dataloader_samplerrJ   rK   r     s
   

z?ORTTrainer._inner_training_loop.<locals>.get_dataloader_samplerz0.20.3clip_grad_normclip_grad_norm_r8   z4.47.0zZThere seems to be not a single sample in your train dataloader, stopping training at step zI! This is expected if you're using an IterableDataset and set num_steps (z.) higher than the number of available samples.zYou enabled PyTorch/XLA debug metrics which is not supported by ONNX Runtime. Check your training configuration if this is unexpected._pastzU

Training completed. Do not forget to share your model on huggingface.co/models =)

r   )num_samples	num_steps
total_flos
train_loss)	use_mtimer   zDeleting older checkpoint [z] due to args.save_total_limit)	torch_ortr   acceleratorfree_memoryr   rh   debugget_train_dataloadergradient_accumulation_steps
world_sizer'   r   maxnum_examples	max_stepsintmathceilnum_train_epochssysmaxsizer   r   UNDERFLOW_OVERFLOWrD   n_gpur   rG   r/   is_fsdp_xla_enabledr   infor   r   r   r   _created_lr_schedulerr   r   r;   NotImplementedErrorbf16r   r   RuntimeWarningr9   r   create_optimizer_and_schedulerr   callback_handlerre   controlstateis_hyper_param_searchr   logging_steps
eval_steps
save_stepsgradient_checkpointinggradient_checkpointing_enable_wrap_modelr   preparer   distributed_typer   	DEEPSPEEDr   use_apexr   	deepspeedr:   _load_optimizer_and_schedulerper_device_train_batch_sizer   epochtimeospathisfiler   TRAINER_STATE_NAMEload_from_jsonglobal_stepignore_data_skiptrain_dataloaderhp_name_trial
trial_namehp_search_backendr"   SIGOPTassignmentsr
   trial_paramsis_local_process_zerois_world_process_zerorY   tensortor   _total_loss_scalar_globalstep_last_logged	zero_gradon_train_beginrangerx   r   rz   
past_indexr   on_epoch_begin_load_rng_stater   	enumerateupdatecloseon_step_begin
accumulatetraining_steplogging_nan_inf_filteris_torch_tpu_xla_availableisnanisinfcurrent_flosfloatfloating_point_opsr   parseaccelerate_versiongradient_state_set_sync_gradientsmax_grad_normfp16clip_master_gradsr   r   r{   r   get_global_grad_normitemr   optimizer_step_was_skippedoptimReduceLROnPlateauon_step_endr2   _maybe_log_save_evaluateon_substep_endshould_epoch_stopshould_training_stopri   on_epoch_endTPU_METRICS_DEBUGdelattrload_best_model_at_endbest_model_checkpointparallel_moder*   DISTRIBUTEDdistbarrier_load_best_modelr)   
store_flosr   r   r   stop_and_update_metricslog_get_output_dir_sorted_checkpointsshould_savesave_total_limitsamefileshutilrmtreeon_train_end_finish_current_pushr#   )/rF   
batch_sizerD   r   r   r   r   r  total_train_batch_sizelen_dataloadernum_update_steps_per_epochr   r   r   num_train_samplesdebug_overflowdelay_optimizer_creationr   rG   use_accelerator_prepare
start_timeepochs_trainedsteps_trained_in_current_epochsteps_trained_progress_barr	  tr_lossr   r   is_random_sampler_total_batched_samplesepoch_iteratorsteps_in_epochrng_to_syncsteps_skippedr   rL   tr_loss_step)is_last_step_and_steps_less_than_grad_acc
_grad_norm	grad_normoptimizer_was_runr   metricsrun_dircheckpoints_sorted
checkpointrJ   r   rK   r     s~  
















































&
zORTTrainer._inner_training_loopTc                    s  | j jr| jr
tjntj}| j|||d}t rtdt	||ur/ddl
m} t||s/|S | jrR|rRtj|| j| j jd\}| _| j jrRddlm} || j| _| j jdkrct|dd	sct|}| j jr|t }| |||}tt | d
| _|s|S | jrzddlm   ddlm! ddl"m#}m$}	 W n t%y   t%dw d }
d }t|dd }| j j&'d|}| j j&d dkrt(j)|| j j&d d}
n%|d urt* }|D ]}t+||}|d u rt,d|-| qt(j)|	|d}
| j j.}| j j&d r fdd} |f|
|d| | _/}d	i fdd}|t0_1|S t2 r%td| j j3t4j5krqi }| j j6d ur=| j j6|d< nt|t7rJ|j8 |d< nd|d< | j j9d ur[| j j9|d< | j j:d urh| j j:|d< t;d i || j<_=|S )!N)dtypezQSagemaker's distrubuted data parallel features are not supported by `ORTTrainer`.r   r   )	opt_level)FP16_Optimizerr3   is_loaded_in_8bitF   )XlaFullyShardedDataParallel)checkpoint_module)size_based_auto_wrap_policytransformer_auto_wrap_policyzJMissing XLA FSDP related module; please make sure to use torch-xla >= 2.0._no_split_modulestransformer_layer_cls_to_wrapmin_num_params)rs  z@Could not find the transformer layer class to wrap in the model.)transformer_layer_clsxla_fsdp_grad_ckptc                    s    | g|R i |S r@   rJ   )mrD   r   FSDPrn  rJ   rK   auto_wrapper_callable  s   z5ORTTrainer._wrap_model.<locals>.auto_wrapper_callable)auto_wrap_policyry  c                 S   s    | j di |}|rt  |S )NrJ   )r   xm	mark_step)r   r=  optimizer_argslossrJ   rJ   rK   patched_optimizer_step  s   z6ORTTrainer._wrap_model.<locals>.patched_optimizer_stepzUSagemaker's distrubuted data parallel features are not supported by `ORTTrainer` yet.find_unused_parametersTbucket_cap_mbbroadcast_buffersrJ   )>rD   use_ipexuse_cpu_amprY   bfloat16float32ipex_optimize_modelr/   r   r   r   r   rx   r   r7   
initializer   fp16_opt_levelr)  )onnxruntime.training.optim.fp16_optimizerrj  r   getattrr   DataParalleljit_mode_evalr   torch_jit_model_evalroundjit_compilation_timer   torch_xla.distributed.fsdprm  rn  torch_xla.distributed.fsdp.wrapro  rp  r   fsdp_configget	functoolspartialr}   r   	Exceptionaddxla_fsdp_configrG   r{  optimizer_stepr.   r:  r*   r;  ddp_find_unused_parametersr   is_gradient_checkpointingddp_bucket_cap_mbddp_broadcast_buffersr   r   ddp_handler)rF   rG   trainingr   rh  r   rj  rS  ro  rp  rz  ry  %default_transformer_cls_names_to_wrap"fsdp_transformer_layer_cls_to_wraptransformer_cls_to_wraplayer_classtransformer_clsfsdp_kwargsr  r   rJ   rw  rK   r   y  s   



	zORTTrainer._wrap_modelc           	         sx  t  r| jn| j}| jdu rt|tjg dd  D   fdd| D | jj	d fdd| D ddg}| jj
tv rIt| j\}}nt| j\}}||fi || _|jdkrd	dl}|j
j }d	}| D ]7}t|tjr|td
d | D  7 }td| d|d  d ||dddi td| d qotd|d  d t  rtd| jS )a  
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        ORTTrainer's init through `optimizers`, or subclass and override this method in a subclass.
        Nc                 S   s   g | ]}d |vr|qS )biasrJ   )r   namerJ   rJ   rK   r     s    z/ORTTrainer.create_optimizer.<locals>.<listcomp>c                    s   g | ]
\}}| v r|qS rJ   rJ   r   npdecay_parametersrJ   rK   r         )paramsweight_decayc                    s   g | ]
\}}| vr|qS rJ   rJ   r  r  rJ   rK   r   	  r  r   Adam8bitr   c                 S   s   i | ]	}|  | qS rJ   )data_ptrnumel)r   r  rJ   rJ   rK   
<dictcomp>  s    z/ORTTrainer.create_optimizer.<locals>.<dictcomp>zskipped z: i   zM paramsweight
optim_bits    zbitsandbytes: will optimize z in fp32z	skipped: zUSagemaker's distributed data parallel features are not supported by `ORTTrainer` yet.)r/   r   rG   r   r    r   	LayerNormnamed_parametersrD   r  r.  r4   r]    get_ort_optimizer_cls_and_kwargsr   get_optimizer_cls_and_kwargsrU   bitsandbytesGlobalOptimManagerget_instancemodulesrx   	Embeddingsumr{   valuesrh   r   register_module_overrider   r   )	rF   	opt_modeloptimizer_grouped_parametersoptimizer_clsoptimizer_kwargsr  managerskippedrR   rJ   r  rK   create_optimizer  s@   

zORTTrainer.create_optimizerreturnc                 C   sv   d| j i}| j| jf| jd}| jtjkr3zddlm} |}|	| W ||fS  t
y2   t
dw td| j )z
        Returns the optimizer class and optimizer parameters implemented in ONNX Runtime based on `ORTTrainingArguments`.

        Args:
            args (`ORTTrainingArguments`):
                The training arguments for the training session.
        lr)betasepsr   )	FusedAdamzbORTTrainer tried to instantiate ORT FusedAdam but onnxruntime-training is not correctly installed!z5ORTTrainer cannot instantiate unsupported optimizer: )learning_rate
adam_beta1
adam_beta2adam_epsilonr.  r4   ADAMW_ORT_FUSEDonnxruntime.training.optimr  r  r   r   )rD   r  adam_kwargsr  r  rJ   rJ   rK   r  )  s   
	z+ORTTrainer.get_ort_optimizer_cls_and_kwargsr   c                 C   s   |d ur|n| j j}tj|dd td|  tf}t| j|sl|d u r+| j	 }t| j
| j|rE| j
| jj||| j jd n2td | j jr`tjj| jtj|tddid nt|tj|t n| jj||| j jd | jd ur| j| t| j tj|t d S )	NT)exist_okzSaving model checkpoint to )
state_dictsafe_serializationzETrainer.model is not a `PreTrainedModel`, only saving its state dict.formatpt)metadata)rD   r   r   makedirsrh   r   r   rx   rG   r  r   r   save_pretrainedsave_safetensorssafetensorsrY   
save_modelr   r   r+   saver,   processing_classTRAINING_ARGS_NAME)rF   r   r  supported_classesrJ   rJ   rK   _saveB  s.   




zORTTrainer._save)NNNNNNNNNr^   N)FN)NNN)NNNNN)TN)(rU   rV   rW   __doc__r	   r   r   Moduler5   r   r   r   r   rX   r   r   r!   r   r   r   rY   r.  	Optimizerr   LambdaLRrZ   rB   rk   r   rr   r   r   r   r   r   r  staticmethodr  r  r\   rJ   rJ   rH   rK   r]      sv    N	

-

q
   
W~2r]   )or  r  r   r   rG  r   r   rp   r   typingr   r   r   r   r   r   r   r	   r  transformers.integrationsr
   transformers.utilsr   	packagingr   
accelerater   r%  accelerate.utilsr   r$  r   r   r   huggingface_hub.utilsutilsr   rY   torch.distributeddistributedr<  r   torch.utils.datar   r   transformers.data.data_collatorr   transformers.debug_utilsr   r   transformers.modeling_utilsr   r   $transformers.tokenization_utils_baser   transformers.trainerr   transformers.trainer_callbackr   r   r   transformers.trainer_pt_utilsr   r   r    transformers.trainer_utilsr!   r"   r#   r$   r%   r&   r'   r(   r)   transformers.training_argsr*   r+   r,   r-   r.   r/   r1   utils.import_utilsr2   training_argsr4   r5   r6   apexr7   #transformers.integrations.deepspeedr9   r:   r;   transformers.deepspeedr<   r  torch_xla.core.xla_modelcore	xla_modelr{  r=   optunar   r  
get_loggerrU   rh   r?   r]   rJ   rJ   rJ   rK   <module>   s~   (,
