
    )jr/              	       &   d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z
 d dlmZ d dlZd dlmZ d dlmZmZ d dlmZ dd	lmZ dd
lmZ d Ze G d d                      Zd Z	 	 	 ddZdeefdedefdZd e            eedfdedededefdZdS )    N)	dataclassfield)partial)Path)average_gradients)tree_flattentree_map)tqdm   )TrainingCallback)CacheDatasetc                 b    t          |           j        fd}|t          |           _        dS )zL
    Update all instances of type(layer) to use gradient checkpointing.
    c                 r      fd} t          j        |                                           g|R i |S )Nc                 F                         |             g|R i |S N)update)paramsargskwargsfnmodels      ^/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/tuner/trainer.pyinner_fnz:grad_checkpoint.<locals>.checkpointed_fn.<locals>.inner_fn   s5    LL   2e-d---f---    )mx
checkpointtrainable_parameters)r   r   r   r   r   s   `   r   checkpointed_fnz(grad_checkpoint.<locals>.checkpointed_fn   sZ    	. 	. 	. 	. 	. 	. 'r}X&&u'A'A'C'CUdUUUfUUUr   N)type__call__)layerr   r   s     @r   grad_checkpointr"      sG     
e	BV V V V V +DKKr   c                      e Zd ZU  edddi          Zeed<    edddi          Zeed<    ed	dd
i          Zeed<    edddi          Z	eed<    edddi          Z
eed<    edddi          Zeed<    edddi          Zeed<    edddi          Zeed<    edddi          Zeed<    edddi          Zeed<   d S )!TrainingArgs   helpzMinibatch size.)defaultmetadata
batch_sized   zIterations to train for.iters   z@Number of validation batches, -1 uses the entire validation set.val_batches
   z0Number of training steps between loss reporting.steps_per_report   z-Number of training steps between validations.steps_per_evalz!Save the model every number stepssteps_per_save   zMaximum sequence length.max_seq_lengthzadapters.safetensorsz/Save/load path for the trained adapter weights.adapter_fileFz0Use gradient checkpointing to reduce memory use.r"   r   zLNumber of steps to accumulate gradients before applying an optimizer update.grad_accumulation_stepsN)__name__
__module____qualname__r   r)   int__annotations__r+   r-   r/   r1   r2   r4   r5   strr"   boolr6    r   r   r$   r$   $   s	        eA9J0KLLLJLLLsf6P-QRRRE3RRRuV
  K    "ELM  c     %v'VW  NC     %v'JK  NC     %(BC  NC    &KL  L#    "ELM  OT    $)5b
$ $ $S     r   r$   c                    |d d d df         }|d d dd f         } | |          }t          j        d|j        d         dz             }t          j        ||d d ddf         k    ||d d dd f         k              }t          j                            ||          |z  }|                                }	|                    t           j	                                                  |	z  }||	fS )Nr   r   )
r   arangeshapelogical_andnnlossescross_entropysumastypefloat32)
r   batchlengthsinputstargetslogitsstepsmaskcentokss
             r   default_lossrS   K   s    111crc6]FAAAqrrElGU6]]FIaq)A-..E>%7111ac6?2EWQQQU^4KLLD		 	 	1	1D	8BHHJJE	2:			"	"	$	$u	,Bu9r   Fc           
   #      K   t           t                    r fd}n fd}t          t          t	                               |          t	                     k     r#t          d dt	                      d          |)|                                |                                nddz  dk    rt          d	          fd
t          dt	                    z
  dz             D             }|rt          j	        
                    |           	 t          j	                            t	          |                    }|D ]w}	 fd||	         D             }
t	          |
d                   dk    rt          |
 \  }
}ndgt	          |
          z  }d |
D             }t          |          |k    r&t          d| dt          |           d| d           d}d|t          |          |z   dz
  |z  z  z   }t          ||          }t          j        z  |ft          j                  }t          z            D ]4}t          ||         |          }|
|         d |         ||d |f<   |||<   5t%          j        |          }
|
t%          j        t)          t          ||                              fV  y|sd S )Nc                 .                         |           S r   )itemlenidxdatasets    r   <lambda>z!iterate_batches.<locals>.<lambda>e   s    W__S11 r   c                 :    t          |          d                   S )Nr   lenrW   s    r   rZ   z!iterate_batches.<locals>.<lambda>g   s    Sa11 r   )keyz&Dataset must have at least batch_size=z examples but only has .r   r   z9The batch size must be divisible by the number of workersc                 8    g | ]}|z   |z   z            S r>   r>   ).0ir)   rX   offsetsteps     r   
<listcomp>z#iterate_batches.<locals>.<listcomp>{   sB        	AJVj0478  r   Tc                      g | ]
}|         S r>   r>   )ra   jrY   s     r   re   z#iterate_batches.<locals>.<listcomp>   s    666AWQZ666r      c                 ,    g | ]}t          |          S r>   r\   )ra   xs     r   re   z#iterate_batches.<locals>.<listcomp>   s    ---!s1vv---r   z)[WARNING] Some sequences are longer than z tokens. The longest sentence z will be truncated to z2. Consider pre-splitting your data to save memory.    )
isinstancer   sortedranger]   
ValueErrorranksizenprandomseedpermutationzipmaxprintminzerosint32r   arraylist)rY   r)   r4   looprt   
comm_grouplen_fn	batch_idxindicesrb   rJ   offsetsrK   pad_tomax_length_in_batch	batch_arrrg   truncated_lengthrX   rc   rd   s   ``                @@@r   iterate_batchesr   [   st      '<(( 211111111
s7||$$&
1
1
1C
7||j  6Z 6 6&)'ll6 6 6
 
 	
 ""  DATUUU      q#c((Z/!3Z@@  I  
	t!)''I77 	? 	?A66661666E58}}!!!$eww#E

*--u---G7||n,,G G G,/LLG GP^G G G   F"#fW1F1Jv0U&V"V"%&9>"J"J*"46I!JBHUUI:-..  #&wqz>#B#B 27(;L<L;L2M	!..../$ 

 HY''E$s7G'<'<"="=>>>>>>> 	EC!r   r3   lossr   c                    |                                   t          j        d          }t          j        d          }|dk    rt          t	          |                    nt          t
          d          }	t          t          |	 ||||t          j        	                                                    dt          t          |          |z  |                    D ]3\  }
} || g|R  \  }}|||z  z  }||z  }t          j         ||           4t          j                            |t          j                  }t          j                            |t          j                  }||z                                  S )	Ng        r   r@   r   )rY   r)   r4   r   zCalculating loss...)desctotalstream)evalr   r|   iterrn   r:   r
   rv   distributedinitry   r]   all_sumcpuitem)r   rY   r)   num_batchesr4   r   r   
all_lossesntokensindex_iterator_rJ   rE   tokss                 r   evaluater      sm    
JJLLL#JhqkkG1<1B1BT%,,---SRSNO%->..00	  	
 	
 ##g,,*,k::   % %5 tE*E***ftm#
4

G$$$$''
26'BBJn$$WRV$<<G &&(((r   r   training_callbackc                     #$ t           j                                        r+t          j        t          j                    d                    t          d|j                    t           j                                        }|	                                }	|
                                }
|	dk    rt          d|
 d|	            |j        rt           j        d                    t          j         |          $|j        ##dk     rt!          d           j        j        t           j        j        g}t'          t           j        ||          #$ fd	            }                                  d}d}d}d}d}d }t-          t/          d|j        dz              |||j        |j        d
|                    D ]Y\  }}t5          j                    }|r|dk    s||j        z  dk    s||j        k    rt5          j                    }t;           |||j        |j        |j        |          }                                  t5          j                    |z
  }|
dk    rt          d| d|dd|ddd
           ||dz
  ||d}|                    |           t5          j                    } ||||#z  dk              \  }}}||z  }||z  }|dz  }t          j         ||||           |t5          j                    |z
  z  }||j!        z  dk    s||j        k    r-t           j        "                    |t           j#                  $                                }|||	z  z  }t           j        "                    |t           j#                  $                                }j%        $                                }|j!        |z  }tM          |          |z  }||z  }t          j'                    dz  }|
dk    r,t          d| d|dd|dd|dd|dd| d|ddd
           ||||||||d} |(                    |            d}d}d}d}||j)        z  dk    r|
dk    rtU          tW           ,                                                    }!t          j-        t]          |j/                  |!           ta          |j/                  j1        |dd z  }"t          j-        t]          |"          |!           t          d| d!|j/         d"|" d#           [|
dk    rotU          tW           ,                                                    }!t          j-        t]          |j/                  |!           t          d$|j/         d#           d S d S )%N max_recommended_working_set_sizezStarting training..., iters: r   zNode z of r   z*grad_accumulation_steps must be at least 1)rL   outputsc                      g| R  \  \  }}}|t          d ||          }|r@t          |          }dk    rt          fd|          }	                    |           d }|||fS )Nc                     | |z   S r   r>   )rj   ys     r   rZ   z%train.<locals>.step.<locals>.<lambda>   s
    Q r   r   c                     | z  S r   r>   )rj   grad_accum_stepss    r   rZ   z%train.<locals>.step.<locals>.<lambda>   s    !.>*> r   )r	   r   r   )
rJ   	prev_grad	do_updatelvaluer   gradr   loss_value_and_gradr   	optimizers
         r   rd   ztrain.<locals>.step   s    225A5AAA ..i@@D 	$T**D!## > > > >EEUD)))DtT!!r   T)rY   r)   r4   r~   r   )r   rY   r   r)   r   r4   r   zIter z: Val loss z.3fz, Val took s)flush)	iterationval_lossval_timer   g    eAz: Train loss z, Learning Rate z.3ez	, It/sec z, Tokens/sec z, Trained Tokens z, Peak mem z GB)r   
train_losslearning_rateiterations_per_secondtokens_per_secondtrained_tokenspeak_memory07dz_adapters.safetensorsz: Saved adapter weights to z and r_   zSaved final weights to )2r   metalis_availableset_wired_limitdevice_inforx   r+   r   r   rq   rp   r"   layersrD   value_and_gradr6   ro   staters   r   compiletrainrv   rn   r)   r4   timeperf_counterr1   r   r-   on_val_loss_reportr   r/   r   r   r   r   floatget_peak_memoryon_train_loss_reportr2   dictr   r   save_safetensorsr<   r5   r   parent)%r   r   train_datasetval_datasetr   r   r   r   world
world_sizerp   r   rd   rE   n_tokensrO   r   
train_time
grad_accumitrJ   ticr   r   val_infor   r   r   r   it_sec
tokens_secpeak_mem
train_infoadapter_weightsr   r   r   s%   ``                                 @@r   r   r      s    
x Q
2>++,NOPPP	
6$*
6
6777N!!EJ::<<DA~~,d,,
,,--- )Q(((+E4883!EFFF[)/29?;ERZu555" " " " " " " 65" 
KKMMMFHENJJ aa  !.	
 	
 	
	 	 k k	E !!  	&!GGrD//144dj8H8H#%%C#? ,#2 /  H KKMMM(**S0Hqyy0B 0 0 (00 0 (/0 0 0 	    !,!#a ( ( 
 "44X>>>#%%C#'4!!Q&$
 $
 j 	&D

vx444d'))C//
 %%**bDJ.>.>//rv/FFKKMMJ%*,,J~--hrv-FFKKMMH%388::M*Z7Fx:5Jh&N)++c1Hqyy2B 2 2Z? 2 2%2:2 2$,2 2 #-42 2 '5	2 2
 !)12 2 2     !,!#",%2-3)3&4#+ 
 "66zBBBFHEJ ##q((TQYY"<0J0J0L0L#M#MNNOD$5 6 6HHHT&''.B1Q1Q1Q1QQ  JAAA9 9 9$9 9+59 9 9   qyy|E,F,F,H,HIIJJ
C 122ODDD<(9<<<===== yr   )FNN) r   dataclassesr   r   	functoolsr   pathlibr   mlx.corecorer   mlx.nnrD   numpyrr   mlx.nn.utilsr   	mlx.utilsr   r	   r
   	callbacksr   datasetsr   r"   r$   rS   r   callabler   r   r>   r   r   <module>r      s    ( ( ( ( ( ( ( (                             * * * * * * , , , , , , , ,       ' ' ' ' ' ' " " " " " "+ + +  # # # # # # # #L  ( 
	G G G G^ ! /$) $) $) $) $) $) $)V %! /*.h> h>
 h> h> h> (h> h> h> h> h> h>r   