diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index e6febff505e5248e6fc908c62293db2461b3eb32..83a5e18714dff8c75768e4bd4f46d898983b70f0 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -40,9 +40,9 @@ def launch(): - ``--rank``: The rank of the node, can be auto assigned by master. Default ``--rank=-1``. - - ``--log_level``: The log levl to set for logging.setLevel. Default ``--log_level=INFO``. + - ``--log_level``: The log level to set for logging.setLevel, which can be CRITICAL/ERROR/WARNING/INFO/DEBUG/NOTSET (case insensitive). The rank 0 log is not printed in the terminal by default; you can enable it by adding ``--log_level=debug``. Default ``--log_level=INFO``. - - ``--nnodes``: The number of nodes for a distributed job, it can be a range in elastic mode, e.g., ``--nnnodes=2:3``. Default ``--nnodes=1``. + - ``--nnodes``: The number of nodes for a distributed job, it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``. - ``--nproc_per_node``: The number of processes to launch on a node. In gpu training, it should be less or equal to the gpus number of you system. e.g., ``--nproc_per_node=8`` @@ -93,9 +93,11 @@ def launch(): Returns: - ``None`` + - ``None`` Examples 0 (master, ip/port auto detection): + .. code-block:: bash + :name: code-block-example-bash0 # For training on multi node, run the following command in one of the nodes @@ -171,7 +173,7 @@ def launch(): .. code-block:: bash :name: code-block-example-bash5 - # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu. + # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker uses a single gpu. 
export CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01 @@ -226,7 +228,7 @@ def launch(): python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py # once the number of nodes changes between 2:4 during training, the strategy holds - + """ # initialize the context to run