
    j/              
          d dl mZmZ d dlmZmZmZ d dlmZ	 ddl
mZmZmZmZ ddlmZ ded	efd
Zdded	ee         fdZd Zd Z	 	 	 ddedee	j        j                 dedee	j                 fdZddZ	 	 	 	 	 ddZdS )    )reducewraps)AnyCallableOptionalN   )tree_flattentree_maptree_reducetree_unflatten   )Modulemodelfnc                 v      fd}t          j        |          t                     fd            }|S )a  Transform the passed function ``fn`` to a function that computes the
    gradients of ``fn`` wrt the model's trainable parameters and also its
    value.

    Args:
        model (mlx.nn.Module): The model whose trainable parameters to compute
                               gradients for
        fn (Callable): The scalar function to compute gradients for

    Returns:
        A callable that returns the value of ``fn`` and the gradients wrt the
        trainable parameters of ``model``
    c                 >                         |             |i |S Nupdate)paramsargskwargsr   r   s      V/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx/nn/utils.pyinner_fnz value_and_grad.<locals>.inner_fn   s*    Vr4"6"""    c                  N                                      g| R i |\  }}||fS r   trainable_parameters)r   r   valuegradr   value_grad_fns       r   wrapped_value_grad_fnz-value_and_grad.<locals>.wrapped_value_grad_fn!   s<    #mE$>$>$@$@R4RRR6RRtd{r   )mxvalue_and_gradr   )r   r   r   r"   r!   s   ``  @r   r$   r$      sk    # # # # # # %h//M
2YY     Y ! r   modulec                 ~       fd}t          j        |          t                     fd            }|S )aP  Transform the passed callable to one that performs gradient
    checkpointing with respect to the trainable parameters of the module (and
    the callable's inputs).

    Args:
        module (mlx.nn.Module): The module for whose parameters we will be
            performing gradient checkpointing.
        fn (Callable, optional): The function to checkpoint. If not provided it
            defaults to the provided module.

    Returns:
        A callable that saves the inputs and outputs during the forward pass
        and recomputes all intermediate states during the backward pass.
    Nc                 >                         |             |i |S r   r   )r   r   r   r   r%   s      r   r   zcheckpoint.<locals>.inner_fn=   s*    fr4"6"""r   c                  @                                      g| R i |S r   r   )r   r   checkpointed_fnr%   s     r   wrapped_checkpointed_fnz+checkpoint.<locals>.wrapped_checkpointed_fnC   s/    v::<<NtNNNvNNNr   )r#   
checkpointr   )r%   r   r   r*   r)   s   ``  @r   r+   r+   )   s~     
z # # # # # # mH--O
2YYO O O O O YO #"r   c                 n    d | D             }d | D             }d | D             }d | D             }||||fS )Nc                     g | ]\  }}|S  r.   ).0k_s      r   
<listcomp>z!_extract_info.<locals>.<listcomp>K   s    $!QAr   c                 "    g | ]\  }}|j         S r.   )shaper/   r1   gs      r   r2   z!_extract_info.<locals>.<listcomp>L       '''$!Qag'''r   c                 "    g | ]\  }}|j         S r.   )sizer5   s      r   r2   z!_extract_info.<locals>.<listcomp>M   s    %%%1QV%%%r   c                 "    g | ]\  }}|j         S r.   )dtyper5   s      r   r2   z!_extract_info.<locals>.<listcomp>N   r7   r   r.   )flatkeysshapessizesdtypess        r   _extract_inforA   J   s]    $D''$'''F%%%%%E''$'''F&&r   c                    g }g }d}t          t          |                     D ]D}|                    |           |||         |z  z  }||k    r|                    |           g }d}E|r|                    |           g }|S )Nr   )rangelenappend)r=   r?   itemsizecommunication_sizegrad_groups
grad_groupgrad_group_sizeis           r   _group_by_sizerL   R   s    KJO3t99    !58h..000z***JO :&&&
r      	gradientsgroupall_reduce_sizecommunication_streamc                 >  	
 pt           j                                                                        dk    r| S |dk    rt	          fd|           S t          |           t                    dk    r| S t                    \  
t          
fd
D                       st          | d          S t          
d         j        |          }g }|D ]}t          fd|dg          }t          j        fd|D                       	t           j                            	          z  	t          j        	|dd                   	|                    	fd	t!          |          D                        t#          |          S )
a  Average the gradients across the distributed processes in the passed group.

    This helper enables concatenating several gradients of small arrays to one
    big all reduce call for better networking performance.

    Args:
        gradients (Any): The Python tree containing the gradients (it should
            have the same structure across processes)
        group (Optional[mlx.core.distributed.Group]): The group of processes to
            average the gradients. If set to ``None`` the global group is used.
            Default: ``None``.
        all_reduce_size (int): Group arrays until their size in bytes exceeds
            this number. Perform one communication step per group of arrays. If
            less or equal to 0 array grouping is disabled. Default: ``32MiB``.
        communication_stream (Optional[mlx.core.Stream]): The stream to use
            for the communication. If unspecified the default communication
            stream is used which can vary by back-end. Default: ``None``.
    r   r   c                 N    t           j                            |           z  S )NrO   stream)r#   distributedall_sum)xNrQ   rO   s    r   <lambda>z#average_gradients.<locals>.<lambda>   s1    bn,,+ -  
  r   c              3   0   K   | ]}|d          k    V  dS )r   Nr.   )r/   dtr@   s     r   	<genexpr>z$average_gradients.<locals>.<genexpr>   s+      44r2?444444r   c                 .    | | d         |         z   gz   S )Nr.   )rX   yr?   s     r   rZ   z#average_gradients.<locals>.<lambda>   s    !quuQx/?.@*@ r   c                 R    g | ]#}|         d                               d          $S r   r_   reshape)r/   rK   
flat_gradss     r   r2   z%average_gradients.<locals>.<listcomp>   s0    BBB!Aq!))"--BBBr   )rU   rO   r_   c              3   p   K   | ]0\  }}|         |                              |                   fV  1d S r   rc   )r/   rK   jbig_gradr=   r>   s      r   r]   z$average_gradients.<locals>.<genexpr>   sZ       " "Aq a(1+--fQi889" " " " " "r   )r#   rV   initr9   r
   r	   rD   rA   allaverage_gradientsrL   r   concatenaterW   splitextend	enumerater   )rN   rO   rP   rQ   rH   new_flat_gradsrI   indicesrY   rh   r@   re   r=   r>   r?   s    ` `    @@@@@@@r   rk   rk   c   s   0 *R^((**E

AAvv!      
 
 	
 "),,
z??a '4J&?&?#feV 4444V44444 	:$Yq999$T5&)./RR % 	 	J@@@@*qcRRG~BBBBzBBB H &&%9 '     x'!B$-88H!! " " " " " "%j11" " "    
 n---r   c                     t          d | d          }t          j                            ||          }t          j        |          }t          j        ||dz   z  d          t          fd|           } | |fS )Nc                 T    | |                                                                 z   S r   )squaresum)accr6   s     r   rZ   z"_clip_grads_fsdp.<locals>.<lambda>   s    sQXXZZ^^5E5E/E r   g        rO   gư>g      ?c                     | z  S r   r.   )r6   
normalizers    r   rZ   z"_clip_grads_fsdp.<locals>.<lambda>   s    Q^ r   )r   r#   rV   rW   sqrtminimumr
   )grads_slicemax_normrO   local_norm_sqglobal_norm_sq	grad_normry   s         @r   _clip_grads_fsdpr      s     E E{TWXXM^++M+GGN''IH	D(893??J3333[AAK	!!r   c           	        !"#$ |pt           j                                        }|                                ||                                ndz  }|dk    rC|+t	          | |          \  } }	|                    | |          |	fS |                    | |          S t          |           "t          |          #t          "          \  }
}$}|d         j        }t          |
$||          }|                                !|	                                }i }i }t          |          D ]\  }}t          j        !"fd|D             d          }t           j                            |||          |z  ||<   |+t           j                            ||         ||          ||<   t          j        !#fd|D             d          }||         ||<   d}	|t	          |||          \  }}	|                    ||          }g }t          |          D ]\  }}t           j                            ||         ||          }!$fd	|D             }g }d}|D ]}||z  }|                    |           t          j        ||dd
         d          }t          |          D ]A\  }}|                    |
|         ||                             ||                   f           Bt%          |          } || |	fS | S )a  Perform a distributed optimizer step by sharding gradients and optimizer states across ranks.

    This helper function performs the following steps:
    1. Reduce-scatter the gradients across ranks so each rank gets a shard of the averaged gradients.
    2. Optionally clip the sharded gradients by global norm.
    3. Apply the optimizer update on the local parameter slice using the sharded gradients.
    4. All-gather the updated parameter slices from all ranks to reconstruct the full parameters tree.

    This is similar to PyTorch's FSDP with `reshard_after_forward=False`.

    Args:
        gradients (Any): The Python tree containing the full gradients (it should
            have the same structure as ``parameters``). Each gradient's first
            dimension must be divisible by ``fsdp_group.size()``.
        parameters (Any): The Python tree containing the full parameters (it should
            have the same structure across processes). Each parameter's first
            dimension must be divisible by ``fsdp_group.size()``.
        optimizer: Optimizer with an ``apply_gradients`` method.
        fsdp_group (Optional[mlx.core.distributed.Group]): The group of processes
            for FSDP sharding. If ``None``, the global group is used.
        dp_group (Optional[mlx.core.distributed.Group]): The group of processes
            for data-parallel gradient averaging. Required when ``fsdp_group`` is
            smaller than the world (e.g. FSDP intra-node, DDP inter-node).
            Default: ``None``.
        communication_size (int): Group arrays until their size in bytes exceeds
            this number. Perform one communication step per group of arrays. If
            less or equal to 0 array grouping is disabled. Default: ``32MiB``.
        communication_stream (Optional[mlx.core.Stream]): The stream to use
            for the communication. If unspecified the default communication
            stream is used which can vary by back-end. Default: ``None``.
        max_norm (Optional[float]): If provided, clip gradients to this
            maximum global norm before applying the optimizer update.
            Default: ``None``.

    Returns:
        If ``max_norm`` is ``None``, returns the updated full-parameter tree.
        Otherwise returns ``(parameters, grad_norm)``, where ``grad_norm`` is
        the global gradient norm before clipping.

    Example:

        >>> optimizer = optim.SGD(learning_rate=0.01)
        >>> # Without gradient clipping
        >>> updated_params = fsdp_apply_gradients(grads, params, optimizer)
        >>> model.update(updated_params)
        >>>
        >>> # With gradient clipping
        >>> updated_params, grad_norm = fsdp_apply_gradients(
        ...     grads, params, optimizer, max_norm=1.0
        ... )
        >>> model.update(updated_params)
    Nr   r   c                 T    g | ]$}|         d                               d          %S rb   rc   )r/   rK   Sre   s     r   r2   z(fsdp_apply_gradients.<locals>.<listcomp>  s2    @@@Z]1%%a,,@@@r   )axisrT   c                 T    g | ]$}|         d                               d          %S rb   rc   )r/   rK   r   flat_paramss     r   r2   z(fsdp_apply_gradients.<locals>.<listcomp>  s2    AAA![^A&&q"--AAAr   rw   c                 &    g | ]}|         z  S r.   r.   )r/   rK   r   r?   s     r   r2   z(fsdp_apply_gradients.<locals>.<listcomp>3  s!    888uQx1}888r   r_   )r#   rV   ri   r9   r   apply_gradientsr	   rA   rL   rankro   rl   sum_scatterrW   
all_gatherrE   rm   rd   r   )%rN   
parameters	optimizer
fsdp_groupdp_grouprG   rQ   r}   rY   r   r=   r>   r@   rF   groups	fsdp_rankgrad_slicesparam_slices	group_idx	arr_grouprh   	big_paramupdated_param_slicesnew_flatbig_gatheredsplit_sizessplit_indicesrv   spartsidx_in_grouprK   resultr   re   r   r?   s%                                    @@@@r   fsdp_apply_gradientsr      s   | 4r~2244J0DX]]___!LAAvv#3Ix#H#H Iy,,Y
CCYNN((J???i((Jz**K"/
";";D&%ay~HD%3EFFFA!!IKL )& 1 1 7 7	9>@@@@@i@@@q
 
 
 N&&
3G '    	I %'^%;%;I&h?S &< & &K	" NAAAAAyAAA
 
 
	 #,I"6Y I!1"
 "
 "
Y
 %44[,OO H )& 1 1 O O	9~00 +' 1 
 

 98888i888 	& 	&A1HC  %%%%}SbS'9BBB(33 	O 	OOL!OOT!WeL&9&A&A&)&L&LMNNNN	O H%%Fy  Mr   r   )NrM   N)NNrM   NN)	functoolsr   r   typingr   r   r   mlx.corecorer#   utilsr	   r
   r   r   layers.baser   r$   r+   rA   rL   rV   GroupintStreamrk   r   r   r.   r   r   <module>r      s   $ # # # # # # # * * * * * * * * * *       G G G G G G G G G G G G      !& !h ! ! ! !:# #v #8H#5 # # # #B' ' '  & -1'04	J. J.J.BN()J. J. #29-	J. J. J. J.Z" " " " #G G G G G Gr   