diff --git a/demo/quick_start/cluster/cluster_train.sh b/demo/quick_start/cluster/cluster_train.sh index aac9b89b14b98ac8e2db7def19e5f06c01682493..a7b1f01064b29cf6abc4cd6b706ee466a6d6da36 100755 --- a/demo/quick_start/cluster/cluster_train.sh +++ b/demo/quick_start/cluster/cluster_train.sh @@ -25,6 +25,7 @@ log_file="$bin_dir/train.log" pushd "$home_dir" cfg=trainer_config.lr.py paddle train \ + --start_pserver=false \ --config=$cfg \ --save_dir=${model_dir} \ --trainer_count=4 \ diff --git a/demo/quick_start/cluster/pserver.sh b/demo/quick_start/cluster/pserver.sh index b187c1d9b9108a607ed310253d54ecc096f0e792..4e1ffe5139e27b4f1209e6b22b42e17d0bbc1b0c 100755 --- a/demo/quick_start/cluster/pserver.sh +++ b/demo/quick_start/cluster/pserver.sh @@ -19,7 +19,7 @@ source "$bin_dir/env.sh" paddle pserver \ --nics=`get_nics` \ --port=7164 \ - --ports_num=1 \ + --ports_num=2 \ --ports_num_for_sparse=1 \ --num_gradient_servers=1 \ --comment="paddle_pserver" \ diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt index 1c1e1964b8d3fd83c801f3988760a72dfc032e7f..9bc48294f06b8ae38ff87fd3542f26c14a0e7795 100644 --- a/paddle/pserver/CMakeLists.txt +++ b/paddle/pserver/CMakeLists.txt @@ -24,13 +24,15 @@ set(PSERVER_SOURCES BaseClient.cpp ParameterClient2.cpp ParameterServer2.cpp - SparseParameterDistribution.cpp) + SparseParameterDistribution.cpp + PServerUtil.cpp) set(PSERVER_HEADERS BaseClient.h ParameterClient2.h ParameterServer2.h - SparseParameterDistribution.h) + SparseParameterDistribution.h + PServerUtil.h) add_library(paddle_pserver STATIC ${PSERVER_SOURCES}) diff --git a/paddle/pserver/PServerUtil.cpp b/paddle/pserver/PServerUtil.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e64569793613cf4c9ed38152d081bd450086dcdd --- /dev/null +++ b/paddle/pserver/PServerUtil.cpp @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PServerUtil.h" + +namespace paddle { + +ParameterServerConfig* PServerUtil::initConfig() { + ParameterServerConfig* config = new ParameterServerConfig(); + config->set_nics(FLAGS_nics); + config->set_port(FLAGS_port); + config->set_ports_num(FLAGS_ports_num); + config->set_rdma_tcp(FLAGS_rdma_tcp); + return config; +} + +PServerUtil* PServerUtil::create() { + auto& pServerConfig = *paddle::PServerUtil::initConfig(); + return PServerUtil::create(pServerConfig); +} + +PServerUtil* PServerUtil::create(const ParameterServerConfig& config) { + return new PServerUtil(config); +} + +PServerUtil::PServerUtil(const ParameterServerConfig& config) { + // round robin to load balance RDMA server ENGINE + std::vector devices; + int rdmaCpu = 0; + int onlineCpus = rdma::numCpus(); + ; + int numPorts = config.ports_num() + config.ports_num_for_sparse(); + + if (FLAGS_nics.empty()) { + pservers_.resize(numPorts); + for (int i = 0; i < numPorts; ++i) { + if (FLAGS_rdma_tcp == "rdma") { + pservers_[i].reset( + new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++)); + rdmaCpu = rdmaCpu % onlineCpus; + } else { + pservers_[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); + } + CHECK(pservers_[i]->init()) << "Fail to initialize parameter server" + << FLAGS_port + i; + } + } else { + str::split(FLAGS_nics, ',', &devices); + pservers_.resize(devices.size() * numPorts); + for (int i = 0; i < numPorts; ++i) { + for (size_t j = 0; j < devices.size(); ++j) { + if (FLAGS_rdma_tcp == "rdma") { + pservers_[i * devices.size() + j].reset(new ParameterServer2( + getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++)); + rdmaCpu = rdmaCpu % onlineCpus; + } else { + pservers_[i * devices.size() + j].reset( + new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i)); + } + CHECK(pservers_[i * devices.size() + j]->init()) + << "Fail to initialize parameter server" << devices[j] + << FLAGS_port + i; + } + } + } +} + +PServerUtil::~PServerUtil() { this->join(); } + +void PServerUtil::start() { + LOG(INFO) << "pserver sizes : " << pservers_.size(); + int i = 0; + for (const auto& pserver : pservers_) { + LOG(INFO) << "pserver started : " << i; + pserver->start(); + i++; + } +} + +void PServerUtil::join() { + LOG(INFO) << "pserver sizes : " << pservers_.size(); + int i = 0; + for (const auto& pserver : pservers_) { + LOG(INFO) << "pserver join : " << i; + pserver->join(); + i++; + } +} + +} // namespace paddle diff --git a/paddle/pserver/PServerUtil.h b/paddle/pserver/PServerUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..dd8d32a4e9bc6a957ec4af0e173099cb4d1c3603 --- /dev/null +++ b/paddle/pserver/PServerUtil.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ParameterServer2.h" +#include "ParameterServerConfig.pb.h" +#include "RDMANetwork.h" +#include "paddle/utils/StringUtil.h" + +namespace paddle { + +class PServerUtil { +public: + DISABLE_COPY(PServerUtil); + static PServerUtil* create(); + static PServerUtil* create(const ParameterServerConfig& config); + explicit PServerUtil(const ParameterServerConfig& config); + ~PServerUtil(); + static ParameterServerConfig* initConfig(); + void start(); + void join(); + +private: + std::vector> pservers_; +}; + +} // namespace paddle diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp index ffc521f2c143d95ff07c3825e0a746cb31743d9b..afba7293eb8c99ff80378e853593806c37489c00 100644 --- a/paddle/pserver/ParameterServer2Main.cpp +++ b/paddle/pserver/ParameterServer2Main.cpp @@ -13,66 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -#include "ParameterServer2.h" -#include "RDMANetwork.h" -#include "paddle/utils/Flags.h" +#include "PServerUtil.h" +#include "paddle/trainer/ParamUtil.h" using namespace paddle; // NOLINT int main(int argc, char** argv) { initMain(argc, argv); - std::vector devices; - std::vector> pservers; - - // round robin to loadbalance RDMA server ENGINE - int rdmaCpu = 0; - int onlineCpus = rdma::numCpus(); - int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse; - if (FLAGS_nics.empty()) { - pservers.resize(numPorts); - for (int i = 0; i < numPorts; ++i) { - if (FLAGS_rdma_tcp == "rdma") { - pservers[i].reset( - new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++)); - rdmaCpu = rdmaCpu % onlineCpus; - } else { - pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); - } - CHECK(pservers[i]->init()) << "Fail to initialize parameter server" - << FLAGS_port + i; - LOG(INFO) << "pserver started : " << FLAGS_port + i; - pservers[i]->start(); - } - } else { - str::split(FLAGS_nics, ',', &devices); - pservers.resize(devices.size() * numPorts); - for (int i = 0; i < numPorts; ++i) { - for (size_t j = 0; j < devices.size(); ++j) { - if (FLAGS_rdma_tcp == "rdma") { - pservers[i * devices.size() + j].reset(new ParameterServer2( - getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++)); - rdmaCpu = rdmaCpu % onlineCpus; - } else { - pservers[i * devices.size() + j].reset( - new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i)); - } - CHECK(pservers[i * devices.size() + j]->init()) - << "Fail to initialize parameter server" << devices[j] - << FLAGS_port + i; - LOG(INFO) << "pserver started : " << devices[j] << ":" - << FLAGS_port + i; - pservers[i * devices.size() + j]->start(); - } - } - } - - for (auto& pserver : pservers) { - pserver->join(); - } + std::unique_ptr pServerPtr(paddle::PServerUtil::create()); + pServerPtr->start(); + pServerPtr->join(); return 0; } diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 947f9cadcc983d58ce31ef462e51dc42e41eaf1b..0d3e4514d6b007506fcd7cd8f1532ee918ab2253 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -13,14 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/pserver/ParameterServer2.h" +#include "paddle/pserver/PServerUtil.h" #include "paddle/utils/Excepts.h" #include "paddle/utils/PythonUtil.h" -#include "paddle/utils/StringUtil.h" #include "ParamUtil.h" #include "Trainer.h" -#include "paddle/pserver/RDMANetwork.h" DEFINE_bool(start_pserver, false, "Whether to start pserver"); DECLARE_int32(gpu_id); @@ -39,54 +37,9 @@ int main(int argc, char** argv) { initMain(argc, argv); initPython(argc, argv); - std::vector> pservers; - std::vector devices; - if (FLAGS_start_pserver) { - // round robin to loadbalance RDMA server ENGINE - int rdmaCpu = 0; - int onlineCpus = rdma::numCpus(); - int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse; - if (FLAGS_nics.empty()) { - pservers.resize(numPorts); - for (int i = 0; i < numPorts; ++i) { - if (FLAGS_rdma_tcp == "rdma") { - pservers[i].reset( - new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++)); - rdmaCpu = rdmaCpu % onlineCpus; - } else { - pservers[i].reset( - new ParameterServer2(std::string(), FLAGS_port + i)); - } - - CHECK(pservers[i]->init()) << "Fail to initialize parameter server" - << FLAGS_port + i; - LOG(INFO) << "pserver started : " << FLAGS_port + i; - pservers[i]->start(); - } - } else { - str::split(FLAGS_nics, ',', &devices); - pservers.resize(devices.size() * numPorts); - for (int i = 0; i < numPorts; ++i) { - for (size_t j = 0; j < devices.size(); ++j) { - if (FLAGS_rdma_tcp == "rdma") { - pservers[i * devices.size() + j].reset(new ParameterServer2( - getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++)); - rdmaCpu = rdmaCpu % onlineCpus; - } else { - pservers[i * devices.size() + j].reset( - new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i)); - } - - CHECK(pservers[i * devices.size() + j]->init()) - << "Fail to initialize parameter server" << devices[j] - << FLAGS_port + i; - LOG(INFO) << "pserver started : " << devices[j] << ":" - << FLAGS_port + i; - pservers[i * devices.size() + j]->start(); - } - } - } + PServerUtil* pServerUtil = paddle::PServerUtil::create(); + pServerUtil->start(); } Trainer trainer; auto config = TrainerConfigHelper::createFromFlags(); diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt index 2c40070eca44d8656d7ce82157a1b840092b9965..e53d06e773aeb1a83ff04558ede9547a6255441c 100644 --- a/proto/CMakeLists.txt +++ b/proto/CMakeLists.txt @@ -4,7 +4,8 @@ set(proto_filenames ModelConfig.proto ParameterConfig.proto ParameterService.proto - TrainerConfig.proto) + TrainerConfig.proto + ParameterServerConfig.proto) set(PROTO_GEN) set(PROTO_GEN_PY) diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto new file mode 100644 index 0000000000000000000000000000000000000000..b4fbf901c20cce58a5f1819c05d3518902c4c165 --- /dev/null +++ b/proto/ParameterServerConfig.proto @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +syntax = "proto2"; + +package paddle; + +message ParameterClientConfig { + required int32 trainer_id = 1; +} + +message ParameterServerConfig { + // The ports number for parameter send, + // increment based on default port number + required int32 ports_num = 1 [default = 1]; + // The ports number for parameter send, + // increment based on default (port + ports_num + required int32 ports_num_for_sparse = 2 [default = 0]; + // network device name for pservers + required string nics = 3 [default = "xgbe0,xgbe1"]; + required string rdma_tcp = 4 [default = "tcp"]; + // Listening port for pserver + required int32 port = 5 [default = 20134]; + // number of gradient servers + required int32 num_gradient_servers = 6 [default = 1]; + // number of threads for sync op exec + required int32 pserver_num_threads = 7 [default = 1]; + // control config_.async_lagged_grad_discard_ratio() min value + required double async_lagged_ratio_min = 8 [default = 1.0]; + // if async_lagged_grad_discard_ratio is not set in trainer_config.conf + // use it as defalut value + required double async_lagged_ratio_default = 9 [default = 1.5]; +} \ No newline at end of file