
    vjH                     ~    d dl Z d dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d Zd Z G d d	e          ZdS )
    N)mpu)_flatten_dense_tensors_unflatten_dense_tensors)Variable)Modulec                       fd}|S )Nc                 R    t           j        j                            |           S N)meanstdtorchnninitnormal_tensorr   r   s    p/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/modelscope/utils/nlp/distributed.pyinit_z!normal_init_method.<locals>.init_   "    x}$$V$C$@@@     )r   r   r   s   `` r   normal_init_methodr      s/    A A A A A A Lr   c                 J     t          j        d|z            z   fd}|S )z3Init method based on N(0, sigma/sqrt(2*num_layers).g       @c                 R    t           j        j                            |           S r
   r   r   s    r   r   z!scaled_init_method.<locals>.init_&   r   r   )mathsqrt)r   r   
num_layersr   s   ``  r   scaled_init_methodr   "   sH    
	#
*++
+CA A A A A A Lr   c                   4     e Zd Z fdZd Zd	dZd
dZ xZS )DistributedDataParallelc                 >    t          t                                                      t          j        t          j        j        k    rdnd _        | _        t          j
                     _        t          j                    } j                                        D ]2}t          j        |          rt          j        || j                   3	 	 	 d fd	g  _        g  _        t'           j                                                  D ]}fd} _        d S )NTFgroupc                 H   j         rd_         i }j                                        D ]P\  }}|j        rD|j        =|j                                        }||vrg ||<   ||                             |           Qj        r*t          j
        j        |v rt          dd           d_        |D ]}||         }d |D             }t          |          }	|r|	                                }	|s| s|	t          j        j                  z  }	t          j        |	j                   t          j
                                         |s| r|	t          j        j                  z  }	t)          |t+          |	|                    D ]\  }
}|
                    |           d S d S )NFzEWARNING: gloo dist backend for half parameters may be extremely slow.z7It is recommended to use the NCCL backend in this case.c                 &    g | ]}|j         j        S r   )graddata).0params     r   
<listcomp>zNDistributedDataParallel.__init__.<locals>.allreduce_params.<locals>.<listcomp>N   s    AAAUZ_AAAr   r#   )needs_reductionmodulenamed_parametersrequires_gradr'   r(   typeappendwarn_on_halfr   cuda
HalfTensorprintr   floatdistget_world_sizedata_parallel_group
all_reducesynchronizezipr   copy_)reduce_afterno_scalefp32_allreducebucketsnamer*   tpbucketgrads	coalescedbufsyncedselfs               r   allreduce_paramsz:DistributedDataParallel.__init__.<locals>.allreduce_params9   s	    $  *',$#';#?#?#A#A 2 2KD%* 2uz/E#joo//W,,*,GBK**5111$ 2z,77cU   -2)! * *B$R[FAA&AAAE 6u = =I% 6$-OO$5$5	# <L <!T%8"&":&< &< &< <	OIT5MNNNNJ**,,,# < <!T%8"&":&< &< &< <	'*!#;Iu#M#M(O (O * *V		&))))*= *  * * *r   c                  F    t           j                                       d S N)r   _execution_enginequeue_callback)unusedrJ   s    r   allreduce_hookz8DistributedDataParallel.__init__.<locals>.allreduce_hookb   s!    *99:JKKKKKr   )TFF)superr!   __init__r7   _backenddist_backendGLOOr2   r-   r   get_data_parallel_groupr9   get_tensor_model_parallel_rank
parametersr   	is_tensor	broadcasthook_handleshookslistrJ   )rI   r-   src_rankpr*   rP   rJ   	__class__s   `     @r   rR   z DistributedDataParallel.__init__.   sH   %t,,55777$(MT5F5K$K$KDDQV#&#>#@#@ 577'')) 	L 	LAq!! Lq($2JKKKK*.&+,1#	* #	* #	* #	* #	* #	*J 
$+002233 	L 	LEL L L L L L !1r   c                 *    d| _          | j        |i |S )NT)r,   r-   )rI   inputskwargss      r   forwardzDistributedDataParallel.forwardg   s!    #t{F-f---r   N Fc                 >    | j                             |||          }|S rL   )r-   
state_dict)rI   destinationprefix	keep_varssds        r   rg   z"DistributedDataParallel.state_dictk   s     [##KCC	r   Tc                 >    | j                             ||           d S )N)strict)r-   load_state_dict)rI   rg   rm   s      r   rn   z'DistributedDataParallel.load_state_dictp   s#    ##Jv#>>>>>r   )Nre   F)T)__name__
__module____qualname__rR   rd   rg   rn   __classcell__)r`   s   @r   r!   r!   ,   sp        71 71 71 71 71r. . .   
? ? ? ? ? ? ? ?r   r!   )r   r   torch.distributeddistributedr7   megatron_utilr   torch._utilsr   r   torch.autogradr   torch.nn.modulesr   r   r   r!   r   r   r   <module>ry      s                         I I I I I I I I # # # # # # # # # # # #    E? E? E? E? E?f E? E? E? E? E?r   