remove legacy cluster_train code

5316c647 · Tao Luo · eec133ca · eec133ca · eec133ca · eec133ca
14 changed files
--- a/paddle/scripts/cluster_train/conf.py
+++ b/paddle/scripts/cluster_train/conf.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-HOSTS = [
-    "root@192.168.100.17",
-    "root@192.168.100.18",
-]
-'''
-workspace configuration
-'''
-#root dir for workspace, can be set as any director with real user account
-ROOT_DIR = "/home/paddle"
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
--- a/paddle/scripts/cluster_train/paddle.py
+++ b/paddle/scripts/cluster_train/paddle.py
-#!/usr/bin/python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" module for launching cluster job """
-import os
-import argparse
-import socket
-import copy
-import time
-import signal
-from fabric.api import run, put, settings, env, prefix
-from fabric.tasks import execute
-#configuration for cluster
-import conf
-def refine_unknown_args(cmd_args):
-    '''
-    refine unknown parameters to handle some special parameters
-    '''
-    new_args = []
-    for arg in cmd_args:
-        if arg.startswith("--") and arg.find("=") != -1:
-            equal_pos = arg.find("=")  #find first = pos
-            arglist = list(arg)
-            arglist[equal_pos] = " "
-            arg = "".join(arglist)
-            arg = arg.lstrip("-")
-            new_args += arg.split(" ")
-        elif arg.startswith("--") and arg.find("=") == -1:
-            arg = arg.lstrip("-")
-            new_args.append(arg)
-        else:
-            new_args.append(arg)
-    return new_args
-def kill_process():
-    '''
-    kill comments threads
-    '''
-    run("ps aux \
-         | grep paddle_process_by_paddle \
-         | grep -v grep  \
-         | awk '{print $2}' \
-         | xargs kill > /dev/null 2>&1")
-def job_prepare(jobdir, data=None):
-    '''
-    prepare job related workspace data
-    Assuming you already installed PaddlePaddle in all nodes which means
-    PaddlePaddle related bins and dependencies libraries.
-    Assuming the train/test data have already been installed.
-    This function just prepare all related model and other resources
-    needed at runtime.
-    '''
-    def job_create_workspace(jobdir, data=None):
-        '''
-        prepare job workspace, common file, etc.
-        '''
-        log = os.path.join(jobdir, "log")
-        if data is not None:
-            #create job dir
-            run('rm ' + jobdir + ' -fr && ' + 'mkdir -p ' + jobdir)
-            #push data and paddle bin
--- a/paddle/scripts/cluster_train/run.sh
+++ b/paddle/scripts/cluster_train/run.sh
-#!/bin/sh
-#python paddle.py \
-#  --job_workspace="${PATH_TO_REMOTE_EXISTED_WORKSPACE}" \
-#  --dot_period=10 \
-#  --ports_num_for_sparse=2 \
-#  --log_period=50 \
-#  --num_passes=10 \
-#  --trainer_count=4 \
-#  --saving_period=1 \
-#  --local=0 \
-#  --config=./trainer_config.py \
-#  --save_dir=./output \
-#  --use_gpu=0
-python paddle.py \
-  --job_dispatch_package="${PATH_TO_LOCAL_WORKSPACE}" \
-  --dot_period=10 \
-  --ports_num_for_sparse=2 \
-  --log_period=50 \
-  --num_passes=10 \
-  --trainer_count=4 \
-  --saving_period=1 \
-  --local=0 \
-  --config=./trainer_config.py \
-  --save_dir=./output \
-  --use_gpu=0
--- a/paddle/scripts/cluster_train_v2/fabric/conf.py
+++ b/paddle/scripts/cluster_train_v2/fabric/conf.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-HOSTS = [
-    "root@10.1.9.7",
-    "root@10.1.18.7",
-    "root@10.1.32.9",
-]
-'''
-workspace configuration
-'''
-#root dir for workspace, can be set as any director with real user account
-ROOT_DIR = "/root"
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 1
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 1
-#trainer whether use gpu
-PADDLE_USE_GPU = "False"
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
-FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
-RUN apt-get update && apt-get install -y openssh-server
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' |chpasswd
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: ssh-servers
-spec:
-  replicas: 3
-  template:
-    metadata:
-      labels:
-        app: ssh-servers
-    spec:
-      containers:
-      - name: ssh-servers
-        image: docker.paddlepaddlehub.com/paddlessh
-        resources:
-          limits:
-            cpu: 500m
-            memory: 1Gi
-          requests:
-            cpu: 500m
-            memory: 1Gi
-        ports:
-        - containerPort: 22
--- a/paddle/scripts/cluster_train_v2/fabric/run.sh
+++ b/paddle/scripts/cluster_train_v2/fabric/run.sh
-#!/bin/bash
-python paddle.py \
-  --job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
-  --dot_period=10 \
-  --ports_num_for_sparse=1 \
-  --log_period=50 \
-  --num_passes=5 \
-  --trainer_count=2 \
-  --saving_period=1 \
-  --local=0 \
-  --config=./trainer_config.py \
-  --save_dir=./output \
-  --use_gpu=0
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
-# Build this image:  docker build -t mpi .
-#
-FROM paddlepaddle/paddle:0.10.0rc3
-ENV DEBIAN_FRONTEND noninteractive
-RUN apt-get update -y && \
-    apt-get upgrade -y && \
-    apt-get install -y openssh-server zip unzip vim sudo \
-gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
-pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
-mkdir /var/run/sshd && \
-echo 'root:tutorial' | chpasswd && \
-sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
-# SSH login fix. Otherwise user is kicked off after login
-sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
-echo "export VISIBLE=now" >> /etc/profile && \
-adduser --disabled-password --gecos "" tutorial && \
-echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
-mkdir /home/tutorial/.ssh/
-ENV HOME /home/tutorial
-ENV NOTVISIBLE "in users profile"
-# ------------------------------------------------------------
-# Set-Up SSH with our Github deploy key
-# ------------------------------------------------------------
-ADD ssh/config /home/tutorial/.ssh/config
-ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
-ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
-ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
-#---------------------------------------------------------------
-#LD_LIBRARY_PATH
-#---------------------------------------------------------------
-RUN export LD_LIBRARY_PATH=/usr/lib/openmpi/lib/
-WORKDIR /home/tutorial
-EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: mpi-header
-  labels:
-    app: mpi-header
-spec:
-  replicas: 1
-  template:
-    metadata:
-      labels:
-        app: mpi-header
-    spec:
-      containers:
-      - image: typhoon1986/paddle-openmpi
-        name : mpi-header
-        resources:
-          limits:
-            cpu: 500m
-            memory: 2Gi
-          requests:
-            cpu: 500m
-            memory: 2Gi
-        ports:
-        - containerPort: 22
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: mpi-nodes
-  labels:
-    app: mpi-nodes
-spec:
-  replicas: 3
-  template:
-    metadata:
-      labels:
-        app: mpi-nodes
-    spec:
-      containers:
-      - image: typhoon1986/paddle-openmpi
-        name : mpi-nodes
-        resources:
-          limits:
-            cpu: 500m
-            memory: 2Gi
-          requests:
-            cpu: 500m
-            memory: 2Gi
-        ports:
-        - containerPort: 22
-        imagePullPolicy: Always
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
-StrictHostKeyChecking no
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
-----BEGIN RSA PRIVATE KEY-----
-MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
-1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
-O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
-36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
-mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
-bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
-OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
-TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
-79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
-YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
-mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
-lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
-rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
-DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
-44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
-fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
-cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
-g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
-yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
-PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
-v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
-hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
-sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
-zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
-yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
-----END RSA PRIVATE KEY-----
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
--- a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
+++ b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
-#!/bin/bash
-# General trainning configurations
-NICS=eth0
-PADDLE_INIT_PORT=7164
-PADDLE_INIT_PORTS_NUM=1
-PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
-PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
-PADDLE_INIT_USE_GPU=False
-PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
-PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
-PADDLE_CLUSTER_TRAIN=True
-env
-# start pserver
-stdbuf -oL nohup paddle pserver \
-  --port=$PADDLE_INIT_PORT \
-  --ports_num=$PADDLE_INIT_PORTS_NUM \
-  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-  --nics=$NICS \
-  --comment=paddle_cluster_pserver \
-  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS \
-  &> logs/pserver.log &
-# start trainer
-# NOTE: train.py will use the above environment variables as configuration
-python train.py &> logs/train.log
-# kill background pservers when train finishes
-ps -ef | grep pserver | awk '{print $2}' | xargs kill