
    zjY`                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZmZmZ d dlm Z  erd dl!m"Z"m#Z# d dl$m%Z%m&Z&  G d de          Z'g Z( G d d          Z)d Z*d Z+d Z,d Z-d Z.d Z/d Z0d Z1 G d d          Z2	 	 	 	 d-d.d,Z3dS )/    )annotationsN)TYPE_CHECKINGAnyLiteral	TypedDict)core)
get_device)_get_trainers_numget_cluster_and_pod)use_paddlecloud)get_cluster_from_args)
DeviceModeblock_windows_and_macoscheck_backend)_prepare_trainer_env_print_argumentsget_host_name_ip)	set_flags)CallableIterable)NotRequiredUnpackc                  8    e Zd ZU ded<   ded<   ded<   ded<   dS )	_SpawnOptionsz3NotRequired[Literal['spawn', 'fork', 'forkserver']]start_methodzNotRequired[str | None]gpusxpuszNotRequired[str]ipsN)__name__
__module____qualname____annotations__     h/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/paddle/distributed/spawn.pyr   r   3   sB         IIII%%%%%%%%r$   r   c                      e Zd Zd ZdS )ParallelEnvArgsc                Z    d | _         d | _        d | _        d | _        d| _        d | _        d S )NT)cluster_node_ipsnode_ipr   started_portprint_configselected_devices)selfs    r%   __init__zParallelEnvArgs.__init__>   sA     $   $ ! ! !%r$   N)r   r    r!   r/   r#   r$   r%   r'   r'   =   s#        % % % % %r$   r'   c                    g d}g d}| D ]<}||vr6||v rt          j        d| dt                     )t          d| d          =d S )N)r   r   r   r   r,   backend)r-   r+   r)   r*   r   zThe config option (z|) of `paddle.distributed.spawn` is deprecated. Please use the latest config options stated in the `spawn` API documentation.z1) of `paddle.distributed.spawn` is not supported.)warningswarnDeprecationWarning
ValueError)optionssupported_optionsdeprecated_optionskeys       r%   _options_valid_checkr:   V   s           '''(((d# d d d&    !`#```   ( r$   c                 R   t                      } | t          j                    v r-t          j        |                     d          d                   S d| v rt          j                    S d| v rt          j                    S d| v rt          j                    S t          d|  d          )N:r   gpuxpucpuI`paddle.distributed.spawn` does not support parallel training on device `` now.)
r	   r   get_available_custom_deviceget_custom_device_countsplitget_cuda_device_countget_xpu_device_countmultiprocessing	cpu_countRuntimeErrordevices    r%   _get_default_nprocsrL   u   s    \\F13333+FLL,=,=a,@AAA	&)+++	&(***	&(***fX^fff
 
 	
r$   c                     t                      } | t          j                    v rdS d| v rdS d| v rdS d| v rdS t          d|  d	          )
Nxcclr=   ncclr>   bkclr?   gloor@   rA   )r	   r   rB   rI   rJ   s    r%   _get_default_backendrR      so    \\F13333v	&v	&v	&vfX^fff
 
 	
r$   c                    d }d |                      d          D             }t          |          dk    r	|d         }nt                      \  }}|S )Nc                6    g | ]}|                                 S r#   )strip.0xs     r%   
<listcomp>z _get_node_ip.<locals>.<listcomp>   s     222a		222r$   ,   r   )rD   lenr   )r   r*   node_ips_s       r%   _get_node_ipr_      sT    G22399S>>222H
8}}1+%''
7Nr$   c           	        d|vs|d         dk    rt                      |d<   t          |d                    t          |d                    g }t                      }|                    dd           |_        |j        )|                    dd           |_        |j        d|_        |d         dk    r|                    dd           |_        |j        |                    dd           |_        t          j        d	d           }||d
k    r+d t          t          j                              D             n|                    d          |j        lt                    | k     r#t          dt                     d|  d          d                    fdt          d|           D                       |_        n|j                            d          }t          |          | k    r#t!          dt          |           d|  d          |D ]<}|vr6t!          d                    |d                                                  =nR|d         dk    r|                    dd           |_        |j        |                    dd           |_        t          j        dd           }||d
k    r+d t          t          j                              D             n|                    d          |j        lt                    | k     r#t          dt                     d|  d          d                    fdt          d|           D                       |_        n8|j                            d          }t          |          | k    r#t!          dt          |           d|  d          |D ]<}|vr6t!          d                    |d                                                  =n|d         dk    rt'          j        d           d |_        d |_        |j        |_        |                    d!d           
J d"            t          |j                            d                    d#k    s
J d$            t/                      d#k    s
J d%            n|d         d&k    rd |_        t          j                    d         }t          j        d'| d(d           }||d
k    r,d) t          t          j        |                    D             n|                    d          t                    | k     r&t          dt                     d|  d*| d+          d                    fd,t          d|           D                       |_        |                    d-d           |_        |j        t7          |j                  |_        |                    d.d           |_        |                    d!d           |_        |j        t;                      |_        |d         dk    r<t=          t          d|                     }t?          |t@          j!        |          \  }	}
ntE          |          \  }	}
|
j#        D ],}|$                    tK          |	||d                              -|                    d/d0          |_&        |j&        rtO          |           |S )1Nr1   autor   r)   z	127.0.0.1rO   r   r-   CUDA_VISIBLE_DEVICES c                ,    g | ]}t          |          S r#   strrV   s     r%   rY   z,_get_subprocess_env_list.<locals>.<listcomp>   +          A     r$   rZ   zthe number of visible devices(z-) is less than the number of spawn processes(z), please ensure that the correct `nprocs` argument is passed or the environment variable `CUDA_VISIBLE_DEVICES` is correctly configured.c                :    g | ]}t          |                   S r#   re   rW   rX   env_devices_lists     r%   rY   z,_get_subprocess_env_list.<locals>.<listcomp>   '    DDDa%a())DDDr$   r   zThe number of selected devices(z0) is not equal to the number of spawn processes(zK), please ensure that the correct `nprocs` and `gpus` arguments are passed.zCThe selected gpu card {} cannot found in CUDA_VISIBLE_DEVICES ({}).rP   r   XPU_VISIBLE_DEVICESc                ,    g | ]}t          |          S r#   re   rV   s     r%   rY   z,_get_subprocess_env_list.<locals>.<listcomp>   rg   r$   z), please ensure that the correct `nprocs` argument is passed or the environment variable `XPU_VISIBLE_DEVICES` is correctly configured.c                :    g | ]}t          |                   S r#   re   ri   s     r%   rY   z,_get_subprocess_env_list.<locals>.<listcomp>   rk   r$   zK), please ensure that the correct `nprocs` and `xpus` arguments are passed.zBThe selected xpu card {} cannot found in XPU_VISIBLE_DEVICES ({}).rQ   zYour model will be trained under CPUONLY mode by using GLOO,because CPUPlace is specified manually or your installed PaddlePaddle only support CPU Device.Tr   z.CPUONLY spawn doesn't support use paddle cloudr[   zJCPUONLY spawn only support single trainer, that is len(ips)=1, but got %s.z+CPUONLY spawn doesn't support multi-trainerrN   FLAGS_selected_sc                ,    g | ]}t          |          S r#   re   rV   s     r%   rY   z,_get_subprocess_env_list.<locals>.<listcomp>#  s.           A     r$   zj), please ensure that the correct `nprocs` argument is passed or the environment variable `FLAGS_selected_zs` is correctly configured.c                :    g | ]}t          |                   S r#   re   ri   s     r%   rY   z,_get_subprocess_env_list.<locals>.<listcomp>2  s'    @@@!S!!$%%@@@r$   r*   r+   r,   F)(rR   r   r   r'   getr)   r-   osgetenvranger   rE   rD   r\   rI   joinr5   formatrF   r2   r3   paddle_cpuonlyr   r
   get_all_custom_device_typerC   r*   r_   r+   r   listr   r   CPUr   trainersappendr   r,   r   )nprocsr6   processes_env_listargsenv_devicesselected_device_listcard_idcustom_device_namedevices_per_procclusterpodtrainerrj   s               @r%   _get_subprocess_env_listr      sU    79#5#?#?133	')$%%%GI.///  D $KKt44D$ ',> E E ($/D! yV## 'FD 9 9 ($+KK0BD$I$ID!i 6==+"3"3    %d&@&B&B C C       +0055 (#$$v--"FS9I5J5J F F*0F F F   %(HHDDDD5F3C3CDDD% %D!! $(#8#>#>s#C#C '((F22 Hc:N6O6O H H5;H H H  
 0  "222$55;V#SXX.>%?%?6 6   3 
	v	%	% 'FD 9 9 ($+KK0BD$I$ID!i 5t<<+"3"3    %d&?&A&A B B       +0055 (#$$v--"ES9I5J5J E E*0E E E   %(HHDDDD5F3C3CDDD% %D!! $(#8#>#>s#C#C '((F22 Hc:N6O6O H H5;H H H  
 0  "222$44:F#SXX.>%?%?5 5   3 
	v	%	%m	
 	
 	
 # $({{,d33;;< <;; 4(..s3344999X :99 !""a'''9 (''' 
	v	%	% $!<>>qAi G2D G G GNN+"3"3   t;<NOOPP     
  +0055  6))S5E1F1F S S&,S S $6S S S   !$@@@@uQ/?/?@@@!
 !

 ;;y$//DL|#D$9::ND99D";;'8$??D#.00 yV##a 0 011,*."2
 
 +400 < 
 
!! '793EFF	
 	
 	
 	

  NE::D r$   c                     t           j                            dd            t           j                            dd            d S )N
http_proxyhttps_proxy)rt   environpopr#   r$   r%   _remove_risky_envr   W  s6     JNN<&&&JNN=$'''''r$   c                    |dk    rt          d| d         i           n|dk    rt          d| d         i           n	 | D ]}| |         t          j        |<   d S )NrO   FLAGS_selected_gpusrP   FLAGS_selected_xpus)r   rt   r   )env_dictr1   var_names      r%   _set_trainer_envr   ^  s     &((3H*IJKKKK	F		((3H*IJKKKK
 	 2 2'1
82 2r$   c                0   	 t                       t          ||            | | }|                    |           d S # t          $ r Y d S t          $ rC dd l}|                    |                                           t          j        d           Y d S w xY w)Nr   r[   )	r   r   putKeyboardInterrupt	Exception	traceback
format_excsysexit)funcr   error_queuereturn_queuer   r1   resultr   s           r%   _func_wrapperr   v  s    7+++t           	,,..///	s   8< 
B	ABBc                  "    e Zd Zd ZddZd ZdS )MultiprocessContextc                l    || _         || _        || _        d t          |          D             | _        d S )Nc                $    i | ]\  }}|j         |S r#   )sentinel)rW   indexprocesss      r%   
<dictcomp>z0MultiprocessContext.__init__.<locals>.<dictcomp>  s.     
 
 
(6wGe
 
 
r$   )error_queuesreturn_queues	processes	enumerate	sentinels)r.   r   r   r   s       r%   r/   zMultiprocessContext.__init__  sG    ( +"
 
:CI:N:N
 
 
r$   Nc                6   t          | j                  dk    rdS t          j                            | j                                        |          }d }|D ]L}| j                            |          }| j        |         }|                                 |j	        dk    r|} nM|t          | j                  dk    S | j        D ]>}|
                                r|                                 |                                 ?|                     |           d S )Nr   T)timeout)r\   r   rG   
connectionwaitkeysr   r   rw   exitcodeis_alive	terminate_throw_exception)r.   r   readyerror_indexr   r   r   s          r%   rw   zMultiprocessContext.join  s*   t~!##4*//N!!7 0 
 
  	 	HN&&x00EnU+GLLNNN1$$# % t~&&!++~ 	 	G!! $!!###LLNNNNk*****r$   c                n   | j         |                                         r^| j        |         j        }|dk     r0t	          j        |           j        }t          d| d| d          t          d| d| d          | j         |                                         }d| d}||z  }t          |          )Nr   zProcess z terminated with signal .z terminated with exit code z9

----------------------------------------------
Process zV terminated with the following error:
----------------------------------------------

)	r   emptyr   r   signalSignalsnamer   rs   )r.   r   r   r   original_tracemsgs         r%   r   z$MultiprocessContext._throw_exception  s    [)//11 
	~k2;H!||~xi005K{KKDKKK    R{RRxRRR   *;7;;==A"A A A 	
 	~nnr$   )N)r   r    r!   r/   rw   r   r#   r$   r%   r   r     sF        
 
 
+ + + +6    r$   r   r#   TFr   Callable[..., None]r   Iterable[Any]r   intrw   booldaemonr6   Unpack[_SpawnOptions]returnc                   t          |           |dk    rt                      }t          ||          }|                    dd          }|d}t	          j        |          }g }	g }
g }t          |          D ]}|                                }|                                }|                    t          | |||||         |d         f          }||_
        |                                 |	                    |           |
                    |           |                    |           t          ||	|
          }|s|S |                                s	 |                                |S )a6  
    Start multiple processes with ``spawn`` method for parallel training.

    .. note::
        ``spawn`` now only supports GPU or XPU collective mode. The collective mode
        of GPU and XPU cannot be started at the same time, so the option `gpus` and
        `xpus` cannot be configured at the same time.

    Args:
        func (function): The target function is called by spawned process.
            This function need to be able to pickled, so it must be defined
            at the top level of a module.
        args (list|tuple, optional): Arguments passed to ``func``.
        nprocs (int, optional): Number of processed to start. Default: -1.
            when nprocs is -1, the available device will be obtained from
            the environment variable when the model is executed: If use GPU,
            the currently available device ID is obtained from the environment
            variable CUDA_VISIBLE_DEVICES; If use XPU, the currently available
            device ID is obtained from the environment variable XPU_VISIBLE_DEVICES.
        join (bool, optional): Perform a blocking join on all spawned processes.
            Default: True.
        daemon (bool, optional): The spawned processes' daemon flag. Default: False.
        **options(dict, optional): Other initial parallel execution environment
            configuration options. The following options are currently supported:
            (1) start_method (string): the way to start a process.
            The start method can be ``spawn`` , ``fork`` , ``forkserver`` .
            Because the CUDA runtime does not support the ``fork`` start method,
            when use CUDA in subprocesses, we should start process by ``spawn``
            or ``forkserver`` method. Default: "spawn" ;
            (2) gpus (string): The training process will run on the
            selected gpus, such as "0,1,2,3". Default: None;
            (3) xpus (string): The training process will run on the
            selected xpus, such as "0,1,2,3". Default: None;
            (5) ips (string): Paddle cluster nodes ips, such as
            "192.168.0.16,192.168.0.17". Default: "127.0.0.1" .

    Returns:
        ``MultiprocessContext`` object, it hold the spawned processes.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
            >>> import paddle
            >>> import paddle.nn as nn
            >>> import paddle.optimizer as opt
            >>> import paddle.distributed as dist

            >>> class LinearNet(nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...         self._linear1 = nn.Linear(10, 10)
            ...         self._linear2 = nn.Linear(10, 1)
            ...     def forward(self, x):
            ...         return self._linear2(self._linear1(x))

            >>> def train(print_result=False):
            ...     # 1. initialize parallel environment
            ...     group = dist.init_parallel_env()
            ...     process_group = group.process_group if group else None
            ...     # 2. create data parallel layer & optimizer
            ...     layer = LinearNet()
            ...     dp_layer = paddle.DataParallel(layer, group = process_group)  # type: ignore[arg-type]
            ...     loss_fn = nn.MSELoss()
            ...     adam = opt.Adam(
            ...         learning_rate=0.001, parameters=dp_layer.parameters())
            ...     # 3. run layer
            ...     inputs = paddle.randn([10, 10], 'float32')
            ...     outputs = dp_layer(inputs)
            ...     labels = paddle.randn([10, 1], 'float32')
            ...     loss = loss_fn(outputs, labels)
            ...     if print_result is True:
            ...         print("loss:", loss.numpy())
            ...     loss.backward()
            ...     adam.step()
            ...     adam.clear_grad()

            >>> # Usage 1: only pass function.
            >>> # If your training method no need any argument, and
            >>> # use all visible devices for parallel training.
            >>> if __name__ == '__main__':
            ...     dist.spawn(train)

            >>> # Usage 2: pass function and arguments.
            >>> # If your training method need some arguments, and
            >>> # use all visible devices for parallel training.
            >>> if __name__ == '__main__':
            ...     dist.spawn(train, args=(True,))

            >>> # Usage 3: pass function, arguments and nprocs.
            >>> # If your training method need some arguments, and
            >>> # only use part of visible devices for parallel training.
            >>> # If your machine hold 8 cards {0,1,2,3,4,5,6,7},
            >>> # this case will use cards {0,1}; If you set
            >>> # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
            >>> # cards {4,5}
            >>> if __name__ == '__main__':
            ...     dist.spawn(train, args=(True,), nprocs=2)

            >>> # Usage 4: pass function, arguments, nprocs and gpus.
            >>> # If your training method need some arguments, and
            >>> # only use part of visible devices for parallel training,
            >>> # but you can't set your machine's environment variable
            >>> # CUDA_VISIBLE_DEVICES, such as it is None or all cards
            >>> # {0,1,2,3,4,5,6,7}, you can pass `gpus` to
            >>> # select the GPU cards you want to use. For example,
            >>> # this case will use cards {4,5} if your machine hold 8 cards.
            >>> if __name__ == '__main__':
            ...     dist.spawn(train, args=(True,), nprocs=2, gpus='4,5')

    r   r   Nspawnr1   )targetr   )r:   rL   r   rs   rG   get_contextrv   SimpleQueueProcessr   r   startr~   r   rw   )r   r   r   rw   r   r6   procs_env_listr   mpr   r   r   ir   r   r   contexts                    r%   r   r     s   r !!! ||$&& .fg>>N ;;~t44L		$\	2	2BLMI6]] " "nn&&~~''** q!	"  

 

  K(((\***!!!!!)\=IIG  llnn  llnn  Nr$   )r#   r   TF)r   r   r   r   r   r   rw   r   r   r   r6   r   r   r   )4
__future__r   rG   rt   r   r   r2   typingr   r   r   r   paddle.baser   paddle.devicer	   paddle.distributed.cloud_utilsr
   r   $paddle.distributed.fleet.cloud_utilsr   paddle.distributed.fleet.launchr   %paddle.distributed.fleet.launch_utilsr   r   r   %paddle.distributed.utils.launch_utilsr   r   r   paddle.frameworkr   collections.abcr   r   typing_extensionsr   r   r   __all__r'   r:   rL   rR   r_   r   r   r   r   r   r   r#   r$   r%   <module>r      s   # " " " " "     				  



  9 9 9 9 9 9 9 9 9 9 9 9       $ $ $ $ $ $        A @ @ @ @ @ A A A A A A         
         
 ' & & & & & 	2222222255555555    	    % % % % % % % %2  >
 
 
 
 
 
   u u up( ( (2 2 20  $= = = = = = = =D o o o o o o or$   