
    jN              
          d dl Z d dlmZ d dlmZmZmZ d dlmZ	 d dl
mZ d dlmZ d dlmZ d dlmZ ed             Zd	 Z	 d d
ededee	j        j                 fdZd Zd Zd Zddddedeeef         deeef         dee	j        j                 fdZddddededeeef         dee	j        j                 fdZ G d de          Z  G d de          Z! G d de          Z" G d de          Z#dS )!    N)	lru_cache)CallableOptionalUnion)Module)Linear)QuantizedLinear)tree_map_with_pathc                                                        dk    rd S t          j        d             }|j         fd            }|S )N   c                     | S N xs    c/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx/nn/layers/distributed.py<lambda>zsum_gradients.<locals>.<lambda>   s         c                     | S r   r   r   s    r   fzsum_gradients.<locals>.f   s    r   c                 F    t           j                            |          S )Ngroup)mxdistributedall_sum)r   dx_r   s      r   r   zsum_gradients.<locals>.f   s    ~%%b%666r   )sizer   custom_functionvjp)r   r   s   ` r   sum_gradientsr"      si    zz||q{   U7 7 7 7 U7 Hr   c                     t          |t                    st          |d         t                    rt          j        | ||          S | j        |         fd|D             }t          j        | ||          S )z:Equivalent to mx.split but allows for fractional segments.r   axisc                 4    g | ]}t          |z            S r   )int).0sNs     r   
<listcomp>z_split.<locals>.<listcomp>$   s#    ,,,as1q5zz,,,r   )
isinstancer'   r   splitshape)weightsegmentsr%   indicesr*   s       @r   _splitr2      s}    (C   5Jx{C$@$@ 5xt4444TA,,,,8,,,G8FG$////r   
parameterssharding_predicater   c                     |pt           j                                        }|                                |                                fd}t          ||           S )zReturns a new parameter tree with the weights sharded according to the
    sharding_predicate.

    The sharding predicate should return the sharding axis and optionally also
    the segments that comprise the weight.
    c           
      t   t          |t          j                  s|S  | |          }||S d d}t          |t                    r|n*t          |t                    r|\  }nt          d          t          j        t          j        fdt          ||          D                                 S )Nr   z;The sharding function should return int or tuple[int, list]c                 >    g | ]}t          |                   S r   )r2   )r(   partr*   r%   rs     r   r+   z-_shard.<locals>._shard_fn.<locals>.<listcomp>L   s*    UUUda&&q)UUUr   r$   )	r,   r   arrayr'   tuple
ValueError
contiguousconcatenater2   )pathr/   r)   r0   r%   r*   r9   r4   s       @r   	_shard_fnz_shard.<locals>._shard_fn7   s    &"(++ 	MtV,,9Ma 	DD5!! 	ND((M   }NUUUUUUfVXt6T6TUUU  
 
 	
r   )r   r   initr   rankr
   )r3   r4   r   r@   r*   r9   s    `  @@r   _shardrC   (   sn     *R^((**E

A

A
 
 
 
 
 
 
4 i444r   c                       fd}|S )zxSimple predicate to shard fully connected layers such that a common
    representation becomes a sharded representation.c                 j    |                      d          rdfS t          |j        dz
  d          fS )Nbias   r   )endswithmaxndimr?   r/   r0   s     r   r@   z"_all_to_sharded.<locals>._shard_fnX   s<    ==   	 x<6;?A&&00r   r   r0   r@   s   ` r   _all_to_shardedrN   T   s$    1 1 1 1 1
 r   c                       fd}|S )zxSimple predicate to shard fully connected layers such that a sharded
    representation becomes a common representation.c                 :    |                      d          rd S dfS )NrF   rG   )rI   rL   s     r   r@   z"_sharded_to_all.<locals>._shard_fnd   s&    ==   	48|r   r   rM   s   ` r   _sharded_to_allrQ   `   s$        
 r   c                 4    | dvrt          d| d          d S )N)all-to-shardedsharded-to-allzSharding type sharding=zB not supported, choose one of 'all-to-sharded' or 'sharded-to-all')r<   )shardings    r   _check_shardingrV   l   sB    ;;;E E E E
 
 	
 <;r   r   r0   r   modulerU   r0   c                   t          |t                    r3t          |           |dk    rt          |          nt	          |          }|                     t          |                                 ||                     dS )aD  Shard a module in-place by updating its parameter dictionary with the
    sharded parameter dictionary.

    The ``sharding`` argument can be any callable that given the path and the
    weight returns the sharding axis and optionally also the segments that
    comprise the unsharded weight. For instance if the weight is a fused QKV
    matrix the segments should be 3.

    .. note::
        The module doesn't change so in order for distributed communication to
        happen the module needs to natively support it and for it to be enabled.

    Args:
        module (mlx.nn.Module): The parameters of this module will be sharded
            in-place.
        sharding (str or callable): One of "all-to-sharded" and
            "sharded-to-all" or a callable that returns the sharding axis and
            segments.
        segments (int or list): The segments to use if ``sharding`` is a
            string. Default: ``1``.
        group (mlx.core.distributed.Group): The distributed group to shard
            across. If not set, the global group will be used. Default: ``None``.
    rS   N)r,   strrV   rN   rQ   updaterC   r3   )rX   rU   r0   r   s       r   shard_inplacer\   v   s    < (C   
!!! +++ H%%% ** 	
 MM&**,,h>>?????r   c                    t          |           t          j        t          j        t
          j        t          j        d} ||t          | t                    f         | ||          S )a  Create a new linear layer that has its parameters sharded and also
    performs distributed communication either in the forward or backward
    pass.

    .. note::
        Contrary to ``shard_inplace``, the original layer is not changed but a
        new layer is returned.

    Args:
        module (mlx.nn.Module): The linear layer to be sharded.
        sharding (str): One of "all-to-sharded" and
            "sharded-to-all" that defines the type of sharding to perform.
        segments (int or list): The segments to use. Default: ``1``.
        group (mlx.core.distributed.Group): The distributed group to shard
            across. If not set, the global group will be used. Default: ``None``.
    ))rS   T)rS   F)rT   T)rT   FrW   )	rV   AllToShardedLinearfrom_linearQuantizedAllToShardedLinearfrom_quantized_linearShardedToAllLinearQuantizedShardedToAllLinearr,   r   )rX   rU   r0   r   fnss        r   shard_linearre      sl    . H"4"@#>#T"4"@#>#T	 C 53xFF3334   r   c            
            e Zd ZdZ	 	 ddedededeej        j	                 f fdZ
d	efd
Zdej        d	ej        fdZeddddedeeef         deej        j	                 fd            Z xZS )r^   a  Each member of the group applies part of the affine transformation such
    that the result is sharded across the group.

    The gradients are automatically aggregated from each member of the group.

    Args:
        input_dims (int): The dimensionality of the input features
        output_dims (int): The dimensionality of the output features
        bias (bool, optional): If set to ``False`` the the layer will not use a
            bias. Default is ``True``.
        group (mx.distributed.Group, optional): The sharding will happen across
            this group. If not set then the global group is used. Default is
            ``None``.
    TN
input_dimsoutput_dimsrF   r   c                    t                                                       t          j        d|z            }|pt          j                                        | _        | j                                        }||z  dk    rt          d| d| d          t          j
                            | |||z  |f          | _        |r.t          j
                            | |||z  f          | _        d S d S )N      ?r    Cannot shard the output of size  across 	 devices.lowhighr.   super__init__mathsqrtr   r   rA   r   r   r<   randomuniformr/   rF   selfrg   rh   rF   r   scaler*   	__class__s          r   rs   zAllToShardedLinear.__init__   s    	 	#
*++3bn1133
JOO!O!!T;TTTTT   i''!#Z0 ( 
 

  		))F"a') *  DIII	 	r   returnc                 v    | j         j        \  }}| j                                        }||z  }d| d| dd| v  S Ninput_dims=, output_dims=, bias=rF   )r/   r.   r   r   )ry   out_dimsin_dimsr*   s       r   _extra_reprzAllToShardedLinear._extra_repr   sM     K-'JOOAUWUUHUUVt^UUUr   r   c                      t          | j                  |          }d| v r(t          j        | d         || d         j                  }n|| d         j        z  }|S )NrF   r/   )r"   r   r   addmmTry   r   s     r   __call__zAllToShardedLinear.__call__   s\    %M$*%%a(( T>>fq$x.*:;;AADN$$Ar   r   rW   linear_layerr0   c                "   |pt           j                                        }|j        j        \  }} | ||t          |d          |          }|                    t          |                                t          |          |                     |S NrF   )
r   r   rA   r/   r.   hasattrr[   rC   r3   rN   clsr   r0   r   rh   rg   sls          r   r_   zAllToShardedLinear.from_linear       .,,.."."5";ZS[',*G*GOO
		&0022OH4M4MuUUVVV	r   TN__name__
__module____qualname____doc__r'   boolr   r   r   Grouprs   rZ   r   r:   r   classmethodr   r   listr_   __classcell__r{   s   @r   r^   r^      s+        & 04   	
 ,-     >VS V V V V	"( 	rx 	 	 	 	 
 &'04   T	"	
 ,-   [    r   r^   c            
            e Zd ZdZ	 	 ddedededeej        j	                 f fdZ
d	efd
Zdej        d	ej        fdZeddddedeeef         deej        j	                 fd            Z xZS )rb   a   Each member of the group applies part of the affine transformation and
    then aggregates the results.

    All nodes will have the same exact result after this layer.

    :class:`ShardedToAllLinear` provides a classmethod :meth:`from_linear` to
    convert linear layers to sharded :obj:`ShardedToAllLinear` layers.

    Args:
        input_dims (int): The dimensionality of the input features
        output_dims (int): The dimensionality of the output features
        bias (bool, optional): If set to ``False`` the the layer will not use a
            bias. Default is ``True``.
        group (mx.distributed.Group, optional): The sharding will happen across
            this group. If not set then the global group is used. Default is
            ``None``.
    TNrg   rh   rF   r   c                    t                                                       t          j        d|z            }|pt          j                                        | _        | j                                        }||z  dk    rt          d| d| d          t          j
                            | ||||z  f          | _        |r+t          j
                            | ||f          | _        d S d S )Nrj   r   The input of size  cannot be sharded across rm   rn   rq   rx   s          r   rs   zShardedToAllLinear.__init__%  s
    	 	#
*++3bn1133
JOONq  WZWW1WWW   i''
a0 ( 
 

  		))F"n *  DIII	 	r   r|   c                 v    | j                                         }| j        j        \  }}||z  }d| d| dd| v  S r~   )r   r   r/   r.   )ry   r*   r   r   s       r   r   zShardedToAllLinear._extra_reprD  sM    JOO K-'1UWUUHUUVt^UUUr   r   c                     || d         j         z  }t          j                            || j                  }d| v r|| d         z   }|S )Nr/   r   rF   )r   r   r   r   r   r   s     r   r   zShardedToAllLinear.__call__J  sK    X  N""1DJ"77T>>DL Ar   r   rW   r   r0   c                "   |pt           j                                        }|j        j        \  }} | ||t          |d          |          }|                    t          |                                t          |          |                     |S r   )
r   r   rA   r/   r.   r   r[   rC   r3   rQ   r   s          r   r_   zShardedToAllLinear.from_linearT  r   r   r   r   r   s   @r   rb   rb     s+        , 04   	
 ,-     >VS V V V V"( rx     
 &'04   T	"	
 ,-   [    r   rb   c                       e Zd ZdZ	 	 	 	 	 ddeded	ed
edededeej	        j
                 f fdZ fdZdefdZdej        dej        fdZeddddedeeef         deej	        j
                 fd            Z xZS )r`   a  Each member of the group applies part of the affine transformation with
    a quantized matrix such that the result is sharded across the group.

    It is the quantized equivalent of :class:`mlx.nn.AllToShardedLinear`.
    Similar to :class:`mlx.nn.QuantizedLinear` its parameters are frozen and
    will not be included in any gradient computation.

    Args:
        input_dims (int): The dimensionality of the input features.
        output_dims (int): The dimensionality of the output features.
        bias (bool, optional): If set to ``False`` then the layer will not use
            a bias. Default: ``True``.
        group_size (int, optional): The group size to use for the quantized
            weight. See :func:`~mlx.core.quantize`. Default: ``64``.
        bits (int, optional): The bit width to use for the quantized weight.
            See :func:`~mlx.core.quantize`. Default: ``4``.
        mode (str, optional): The quantization method to use (see
            :func:`~mlx.core.quantize`). Default: ``"affine"``.
        group (mx.distributed.Group, optional): The sharding will happen across
            this group. If not set then the global group is used. Default is
            ``None``.
    T@      affineNrg   rh   rF   
group_sizebitsmoder   c                 |   t                                                       || _        || _        || _        t          j        d|z            }|pt          j        	                                | _
        | j
                                        }	||	z  dk    rt          d| d|	 d          t          j                            | |||	z  |f          }
t          j        |
|||          ^| _        | _        }|r|d         nd | _        |rt          j        ||	z  f          | _        |                                  d S )Nrj   r   rk   rl   rm   rn   r   rr   rs   r   r   r   rt   ru   r   r   rA   r   r   r<   rv   rw   quantizer/   scalesbiaseszerosrF   freezery   rg   rh   rF   r   r   r   r   rz   r*   r/   r   r{   s               r   rs   z$QuantizedAllToShardedLinear.__init__}  sU    	 %		 	#
*++3bn1133
JOO!O!!T;TTTTT   ""!#Z0 # 
 

 -/KJ4-
 -
 -
)T[6 $*3fQiit  	6+"2!455DI 	r   c                 f     t                      j        |i | |                     d           dS zlWrap unfreeze so that we unfreeze any layers we might contain but
        our parameters will remain frozen.F)recurseNrr   unfreezer   ry   argskwargsr{   s      r   r   z$QuantizedAllToShardedLinear.unfreeze  ;     	$)&)))E"""""r   r|   c                     | j         j        \  }}|dz  | j        z  }|| j                                        z  }d| d| dd| v  d| j         d| j         d| j         S 	N    r   r   r   rF   z, group_size=z, bits=z, mode=r/   r.   r   r   r   r   r   ry   r   r   s      r   r   z'QuantizedAllToShardedLinear._extra_repr  s     K-'R<DI-DJOO%%%P' P P P P&D. P P/P P26)P PDHIP P	
r   r   c           
           t          | j                  |          }t          j        || d         | d         |                     d          d| j        | j        | j                  }d| v r|| d         z   }|S )Nr/   r   r   Tr   r   	transposer   r   r   rF   )r"   r   r   quantized_matmulgetr   r   r   r   s     r   r   z$QuantizedAllToShardedLinear.__call__  s    %M$*%%a((N>88H%%	
 	
 	
 T>>DL Ar   r   rW   quantized_linear_layerr0   c                v   |pt           j                                        }|j        j        \  }}|dz  |j        z  } | ||t          |d          |j        |j        t          |dd          |          }|	                    t          |                                t          |          |                     |S Nr   rF   r   r   )r   r   r   r   )r   r   rA   r/   r.   r   r   r   getattrr[   rC   r3   rN   r   r   r0   r   rh   rg   r   s          r   ra   z1QuantizedAllToShardedLinear.from_quantized_linear       .,,.."8"?"EZ 2o*@*EE
S*F33-8',/BB
 
 
 			&1133)) 	
 	
 	
 	r   Tr   r   r   Nr   r   r   r   r'   r   rZ   r   r   r   r   rs   r   r   r:   r   r   r   r   r   ra   r   r   s   @r   r`   r`   e  sg        6 04* ** * 	*
 * * * ,-* * * * * *X# # # # #
S 
 
 
 
"( rx    $ 
 &'04   & T	"	
 ,-   [    r   r`   c                       e Zd ZdZ	 	 	 	 	 ddeded	ed
edededeej	        j
                 f fdZ fdZdefdZdej        dej        fdZeddddedeeef         deej	        j
                 fd            Z xZS )rc   a  Each member of the group applies part of the affine transformation using
    the quantized matrix and then aggregates the results.

    All nodes will have the same exact result after this layer.

    It is the quantized equivalent of :class:`mlx.nn.ShardedToAllLinear`.
    Similar to :class:`mlx.nn.QuantizedLinear` its parameters are frozen and
    will not be included in any gradient computation.

    Args:
        input_dims (int): The dimensionality of the input features.
        output_dims (int): The dimensionality of the output features.
        bias (bool, optional): If set to ``False`` then the layer will not use
            a bias. Default: ``True``.
        group_size (int, optional): The group size to use for the quantized
            weight. See :func:`~mlx.core.quantize`. Default: ``64``.
        bits (int, optional): The bit width to use for the quantized weight.
            See :func:`~mlx.core.quantize`. Default: ``4``.
        mode (str, optional): The quantization method to use (see
            :func:`~mlx.core.quantize`). Default: ``"affine"``.
        group (mx.distributed.Group, optional): The sharding will happen across
            this group. If not set then the global group is used. Default is
            ``None``.
    Tr   r   r   Nrg   rh   rF   r   r   r   r   c                 v   t                                                       || _        || _        || _        t          j        d|z            }|pt          j        	                                | _
        | j
                                        }	||	z  dk    rt          d| d|	 d          t          j                            | ||||	z  f          }
t          j        |
|||          ^| _        | _        }|r|d         nd | _        |rt          j        |f          | _        |                                  d S )Nrj   r   r   r   rm   rn   r   r   r   s               r   rs   z$QuantizedShardedToAllLinear.__init__  sN    	 %		 	#
*++3bn1133
JOONq  WZWW1WWW   ""
a0 # 
 

 -/KJ4-
 -
 -
)T[6 $*3fQiit  	1+00DI 	r   c                 f     t                      j        |i | |                     d           dS r   r   r   s      r   r   z$QuantizedShardedToAllLinear.unfreeze0  r   r   r|   c                     | j         j        \  }}|dz  | j        z  | j                                        z  }d| d| dd| v  d| j         d| j         d| j         S r   r   r   s      r   r   z'QuantizedShardedToAllLinear._extra_repr6  s     K-'R<DI-
0A0AAP' P P P P&D. P P/P P26)P PDHIP P	
r   r   c           
         t          j        || d         | d         |                     d          d| j        | j        | j                  }t           j                            || j                  }d| v r|| d         z   }|S )Nr/   r   r   Tr   r   rF   )	r   r   r   r   r   r   r   r   r   r   s     r   r   z$QuantizedShardedToAllLinear.__call__>  s    N>88H%%	
 	
 	
 N""1DJ"77T>>DL Ar   r   rW   r   r0   c                v   |pt           j                                        }|j        j        \  }}|dz  |j        z  } | ||t          |d          |j        |j        t          |dd          |          }|	                    t          |                                t          |          |                     |S r   )r   r   rA   r/   r.   r   r   r   r   r[   rC   r3   rQ   r   s          r   ra   z1QuantizedShardedToAllLinear.from_quantized_linearN  r   r   r   r   r   s   @r   rc   rc     sg        : 04* ** * 	*
 * * * ,-* * * * * *X# # # # #
S 
 
 
 
"( rx      
 &'04   & T	"	
 ,-   [    r   rc   r   )$rt   	functoolsr   typingr   r   r   mlx.corecorer   mlx.nn.layers.baser   mlx.nn.layers.linearr   mlx.nn.layers.quantizedr	   	mlx.utilsr
   r"   r2   dictr   r   rC   rN   rQ   rV   rZ   r'   r   r\   re   r^   rb   r`   rc   r   r   r   <module>r      s          , , , , , , , , , ,       % % % % % % ' ' ' ' ' ' 3 3 3 3 3 3 ( ( ( ( ( (   0 0 0 -1)5 )5)5 )5 BN())5 )5 )5 )5X	 	 		 	 	
 
 
 "#,0%@ %@ %@%@CM"%@ CI	%@
 BN()%@ %@ %@ %@X "#,0        CI	 
 BN()       FN N N N N N N NbP P P P P P P PfB B B B B& B B BJA A A A A& A A A A Ar   