diff --git a/paddle/scripts/cluster_train/conf.py b/paddle/scripts/cluster_train/conf.py deleted file mode 100644 index c77d7584d3c89144761875b0fbc70369e355930a..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train/conf.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -HOSTS = [ - "root@192.168.100.17", - "root@192.168.100.18", -] -''' -workspace configuration -''' -#root dir for workspace, can be set as any director with real user account -ROOT_DIR = "/home/paddle" -''' -network configuration -''' -#pserver nics -PADDLE_NIC = "eth0" -#pserver port -PADDLE_PORT = 7164 -#pserver ports num -PADDLE_PORTS_NUM = 2 -#pserver sparse ports num -PADDLE_PORTS_NUM_FOR_SPARSE = 2 - -#environments setting for all processes in cluster job -LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64" diff --git a/paddle/scripts/cluster_train/paddle.py b/paddle/scripts/cluster_train/paddle.py deleted file mode 100644 index ba313ac6a18fe22e1e14d2cce42320ab6d4fe398..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train/paddle.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" module for launching cluster job """ - -import os -import argparse -import socket -import copy -import time -import signal - -from fabric.api import run, put, settings, env, prefix -from fabric.tasks import execute - -#configuration for cluster -import conf - - -def refine_unknown_args(cmd_args): - ''' - refine unknown parameters to handle some special parameters - ''' - new_args = [] - for arg in cmd_args: - if arg.startswith("--") and arg.find("=") != -1: - equal_pos = arg.find("=") #find first = pos - arglist = list(arg) - arglist[equal_pos] = " " - arg = "".join(arglist) - arg = arg.lstrip("-") - new_args += arg.split(" ") - elif arg.startswith("--") and arg.find("=") == -1: - arg = arg.lstrip("-") - new_args.append(arg) - else: - new_args.append(arg) - return new_args - - -def kill_process(): - ''' - kill comments threads - ''' - run("ps aux \ - | grep paddle_process_by_paddle \ - | grep -v grep \ - | awk '{print $2}' \ - | xargs kill > /dev/null 2>&1") - - -def job_prepare(jobdir, data=None): - ''' - prepare job related workspace data - - Assuming you already installed PaddlePaddle in all nodes which means - PaddlePaddle related bins and dependencies libraries. - Assuming the train/test data have already been installed. - This function just prepare all related model and other resources - needed at runtime. - ''' - - def job_create_workspace(jobdir, data=None): - ''' - prepare job workspace, common file, etc. - ''' - log = os.path.join(jobdir, "log") - if data is not None: - #create job dir - run('rm ' + jobdir + ' -fr && ' + 'mkdir -p ' + jobdir) - #push data and paddle bin diff --git a/paddle/scripts/cluster_train/run.sh b/paddle/scripts/cluster_train/run.sh deleted file mode 100644 index 331c64988137745a5afab562e968d6f8dc122b85..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train/run.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -#python paddle.py \ -# --job_workspace="${PATH_TO_REMOTE_EXISTED_WORKSPACE}" \ -# --dot_period=10 \ -# --ports_num_for_sparse=2 \ -# --log_period=50 \ -# --num_passes=10 \ -# --trainer_count=4 \ -# --saving_period=1 \ -# --local=0 \ -# --config=./trainer_config.py \ -# --save_dir=./output \ -# --use_gpu=0 - -python paddle.py \ - --job_dispatch_package="${PATH_TO_LOCAL_WORKSPACE}" \ - --dot_period=10 \ - --ports_num_for_sparse=2 \ - --log_period=50 \ - --num_passes=10 \ - --trainer_count=4 \ - --saving_period=1 \ - --local=0 \ - --config=./trainer_config.py \ - --save_dir=./output \ - --use_gpu=0 diff --git a/paddle/scripts/cluster_train_v2/fabric/conf.py b/paddle/scripts/cluster_train_v2/fabric/conf.py deleted file mode 100644 index e96503d093a4317df7bb006043eb42098f51b6f5..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/fabric/conf.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -HOSTS = [ - "root@10.1.9.7", - "root@10.1.18.7", - "root@10.1.32.9", -] -''' -workspace configuration -''' -#root dir for workspace, can be set as any director with real user account -ROOT_DIR = "/root" -''' -network configuration -''' -#pserver nics -PADDLE_NIC = "eth0" -#pserver port -PADDLE_PORT = 7164 -#pserver ports num -PADDLE_PORTS_NUM = 1 -#pserver sparse ports num -PADDLE_PORTS_NUM_FOR_SPARSE = 1 -#trainer whether use gpu -PADDLE_USE_GPU = "False" -#environments setting for all processes in cluster job -LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64" diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile deleted file mode 100644 index 6606c01265af1fa8009e67906a3dbbe5c95ebc0d..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2 -RUN apt-get update && apt-get install -y openssh-server -RUN mkdir /var/run/sshd - -RUN echo 'root:root' |chpasswd - -RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config - -EXPOSE 22 -CMD ["/usr/sbin/sshd", "-D"] diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml deleted file mode 100644 index 0784b2d1b8785796f94fff1607643218564fc126..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: Deployment -metadata: - name: ssh-servers -spec: - replicas: 3 - template: - metadata: - labels: - app: ssh-servers - spec: - containers: - - name: ssh-servers - image: docker.paddlepaddlehub.com/paddlessh - resources: - limits: - cpu: 500m - memory: 1Gi - requests: - cpu: 500m - memory: 1Gi - ports: - - containerPort: 22 diff --git a/paddle/scripts/cluster_train_v2/fabric/run.sh b/paddle/scripts/cluster_train_v2/fabric/run.sh deleted file mode 100644 index f6324bcb136803ebc30e69bcdaa2f8725cb0ccba..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/fabric/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -python paddle.py \ - --job_dispatch_package="/root/wuyi/fabric_submit/workspace" \ - --dot_period=10 \ - --ports_num_for_sparse=1 \ - --log_period=50 \ - --num_passes=5 \ - --trainer_count=2 \ - --saving_period=1 \ - --local=0 \ - --config=./trainer_config.py \ - --save_dir=./output \ - --use_gpu=0 diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile deleted file mode 100644 index c2f631bdf4ed52a5dfa3fbcf1157d0abbdeadb9b..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -# Build this image: docker build -t mpi . -# - -FROM paddlepaddle/paddle:0.10.0rc3 - -ENV DEBIAN_FRONTEND noninteractive - -RUN apt-get update -y && \ - apt-get upgrade -y && \ - apt-get install -y openssh-server zip unzip vim sudo \ -gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \ -pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \ -mkdir /var/run/sshd && \ -echo 'root:tutorial' | chpasswd && \ -sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ -# SSH login fix. Otherwise user is kicked off after login -sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \ -echo "export VISIBLE=now" >> /etc/profile && \ -adduser --disabled-password --gecos "" tutorial && \ -echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \ -mkdir /home/tutorial/.ssh/ - -ENV HOME /home/tutorial -ENV NOTVISIBLE "in users profile" - -# ------------------------------------------------------------ -# Set-Up SSH with our Github deploy key -# ------------------------------------------------------------ - -ADD ssh/config /home/tutorial/.ssh/config -ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa -ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub -ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys - -#--------------------------------------------------------------- -#LD_LIBRARY_PATH -#--------------------------------------------------------------- - -RUN export LD_LIBRARY_PATH=/usr/lib/openmpi/lib/ - -WORKDIR /home/tutorial -EXPOSE 22 -CMD ["/usr/sbin/sshd", "-D"] diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml deleted file mode 100644 index 34835e5eb8d7cb92ad3cf7758a47c9e565a7dcf6..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: Deployment -metadata: - name: mpi-header - labels: - app: mpi-header -spec: - replicas: 1 - template: - metadata: - labels: - app: mpi-header - spec: - containers: - - image: typhoon1986/paddle-openmpi - name : mpi-header - resources: - limits: - cpu: 500m - memory: 2Gi - requests: - cpu: 500m - memory: 2Gi - ports: - - containerPort: 22 diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml deleted file mode 100644 index 2fd5cb4d44a25efac68dd8c9195dea9fd8f84a26..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: Deployment -metadata: - name: mpi-nodes - labels: - app: mpi-nodes -spec: - replicas: 3 - template: - metadata: - labels: - app: mpi-nodes - spec: - containers: - - image: typhoon1986/paddle-openmpi - name : mpi-nodes - resources: - limits: - cpu: 500m - memory: 2Gi - requests: - cpu: 500m - memory: 2Gi - ports: - - containerPort: 22 - imagePullPolicy: Always diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config deleted file mode 100644 index a9ecad07c39e4a9d6f0572d6cbf77795d99681f2..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config +++ /dev/null @@ -1 +0,0 @@ -StrictHostKeyChecking no diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi deleted file mode 100644 index 23768343edf5258cf525523d471f67071a24f5de..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4 -1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y -O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk -36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE -mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6 -bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B -OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ -TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o -79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO -YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx -mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy -lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y -rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo -DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv -44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H -fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6 -cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn -g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K -yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm -PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp -v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8 -hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G -sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg -zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv -yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4= ------END RSA PRIVATE KEY----- diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub deleted file mode 100644 index 015f2b42e71920e00de090cbb1108d9a12ed5f0c..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi diff --git a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh deleted file mode 100644 index 2a7f46362749a68c341635bec1b34e72e3b86686..0000000000000000000000000000000000000000 --- a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# General trainning configurations - -NICS=eth0 -PADDLE_INIT_PORT=7164 -PADDLE_INIT_PORTS_NUM=1 -PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 -PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g') -PADDLE_INIT_USE_GPU=False - -PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE} -PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK} -PADDLE_CLUSTER_TRAIN=True - -env - -# start pserver -stdbuf -oL nohup paddle pserver \ - --port=$PADDLE_INIT_PORT \ - --ports_num=$PADDLE_INIT_PORTS_NUM \ - --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \ - --nics=$NICS \ - --comment=paddle_cluster_pserver \ - --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS \ - &> logs/pserver.log & - -# start trainer -# NOTE: train.py will use the above environment variables as configuration -python train.py &> logs/train.log - -# kill background pservers when train finishes -ps -ef | grep pserver | awk '{print $2}' | xargs kill