Unverified commit 0efa64c8 authored by zmxdream, committed by GitHub

[GPUPS]Config fleet optimize 2 (#39783)

* update. test=develop

* update. test=develop

* fix. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* update. test=develop

* update. test=develop
Parent 85a11c47
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <google/protobuf/text_format.h>
#include <cstdlib>
#include <string>
#include <vector>
...@@ -20,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB) (defined PADDLE_WITH_PSLIB)
...@@ -44,6 +46,164 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, ...@@ -44,6 +46,164 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
dense_grad_names_[table_id][j] = table.dense_grad_name(j); dense_grad_names_[table_id][j] = table.dense_grad_name(j);
} }
} }
// add for hbmps optimizer config
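// The fleet descriptor arrives as a text-format protobuf string; parse it back
// into a PSParameter so the sparse-table optimizer settings can be forwarded
// to the GPU parameter server below.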
auto fleet_desc_str = trainer_desc.fleet_desc();
google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param);
auto sparse_table =
_ps_param.server_param().downpour_server_param().downpour_table_param(0);
auto sparse_table_accessor = sparse_table.accessor();
auto sparse_table_accessor_parameter =
sparse_table_accessor.downpour_accessor_param();
auto accessor_class = sparse_table_accessor.accessor_class();
// gpups' sparse table optimizer config
// currently only a single sparse table is supported
// auto sparse_table = param_.sparse_table(0);
std::unordered_map<std::string, float> config;
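// Which keys are filled depends on the accessor class; "mf_"-prefixed entries
// are taken from the embedx optimizer parameters.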
if (accessor_class == "DownpourFeatureValueAccessor" ||
accessor_class == "DownpourCtrAccessor" ||
accessor_class == "DownpourCtrDoubleAccessor") {
config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
config["learning_rate"] =
sparse_table_accessor.sparse_sgd_param().learning_rate();
config["initial_g2sum"] =
sparse_table_accessor.sparse_sgd_param().initial_g2sum();
config["initial_range"] =
sparse_table_accessor.sparse_sgd_param().initial_range();
if (sparse_table_accessor.sparse_sgd_param().weight_bounds_size() == 2) {
config["min_bound"] =
sparse_table_accessor.sparse_sgd_param().weight_bounds()[0];
config["max_bound"] =
sparse_table_accessor.sparse_sgd_param().weight_bounds()[1];
}
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
} else if (accessor_class == "DownpourSparseValueAccessor") {
auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name();
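// DownpourSparseValueAccessor keeps one common SGD config; dispatch on its name.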
if (optimizer_name == "naive") {
config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param()
.naive()
.learning_rate();
config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param()
.naive()
.initial_range();
if (sparse_table_accessor.sparse_commonsgd_param()
.naive()
.weight_bounds_size() == 2) {
config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.naive()
.weight_bounds()[0];
config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.naive()
.weight_bounds()[1];
}
} else if (optimizer_name == "adagrad") {
config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.learning_rate();
config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.initial_range();
config["initial_g2sum"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.initial_g2sum();
if (sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.weight_bounds_size() == 2) {
config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.weight_bounds()[0];
config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.adagrad()
.weight_bounds()[1];
}
} else if (optimizer_name == "adam") {
config["learning_rate"] =
sparse_table_accessor.sparse_commonsgd_param().adam().learning_rate();
config["initial_range"] =
sparse_table_accessor.sparse_commonsgd_param().adam().initial_range();
if (sparse_table_accessor.sparse_commonsgd_param()
.adam()
.weight_bounds_size() == 2) {
config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.adam()
.weight_bounds()[0];
config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
.adam()
.weight_bounds()[1];
}
}
} else if (accessor_class == "DownpourUnitAccessor" ||
accessor_class == "DownpourDoubleUnitAccessor") {
config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name();
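// For the unit accessors the embedx optimizer is configured separately; map its
// hyper-parameters onto the "mf_"-prefixed keys.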
if (optimizer_name == "naive") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().naive().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().naive().initial_range();
if (sparse_table_accessor.embedx_sgd_param()
.naive()
.weight_bounds_size() == 2) {
config["mf_min_bound"] =
sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0];
config["mf_max_bound"] =
sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1];
}
} else if (optimizer_name == "adagrad") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_range();
config["mf_initial_g2sum"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum();
if (sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds_size() == 2) {
config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[0];
config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[1];
}
} else if (optimizer_name == "std_adagrad") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_range();
config["mf_initial_g2sum"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum();
if (sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds_size() == 2) {
config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[0];
config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[1];
}
} else if (optimizer_name == "adam") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adam().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adam().initial_range();
if (sparse_table_accessor.embedx_sgd_param()
.adam()
.weight_bounds_size() == 2) {
config["mf_min_bound"] =
sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0];
config["mf_max_bound"] =
sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1];
}
}
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
}
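// Hand the assembled optimizer config to the GPU parameter server wrapper.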
auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance();
ps_gpu_wrapper->InitializeGPUServer(config);
scale_datanorm_ = trainer_desc.scale_datanorm();
int place_num = trainer_desc.worker_places_size();
const std::vector<paddle::framework::DataFeed*> readers =
...
...@@ -36,6 +36,10 @@ limitations under the License. */
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/phi/backends/dynload/port.h"
#ifdef PADDLE_WITH_PSLIB
#include <pslib.h>
#endif
namespace paddle {
namespace framework {
...@@ -287,6 +291,9 @@ class PSGPUTrainer : public TrainerBase {
int mpi_rank_;
int mpi_size_;
int dump_file_num_;
// _ps_param for gpups optimizer config
::paddle::PSParameter _ps_param;
};
#endif
...
...@@ -66,6 +66,9 @@ message TrainerDesc {
repeated int32 trainers = 35;
optional int32 trainer_id = 36;
// add for gpu
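// text-format serialized fleet descriptor, re-parsed by PSGPUTrainer::Initialize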
optional string fleet_desc = 37;
// device worker parameters
optional HogwildWorkerParameter hogwild_param = 101;
optional DownpourWorkerParameter downpour_param = 103;
...
...@@ -111,6 +111,10 @@ class TrainerDesc(object):
def _set_fleet_desc(self, fleet_desc):
self._fleet_desc = fleet_desc
## serialize fleet_desc
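## (text format, so the C++ PSGPUTrainer can re-parse it with protobuf TextFormat)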
from google.protobuf import text_format
fleet_desc_str = text_format.MessageToString(fleet_desc)
self.proto_desc.fleet_desc = fleet_desc_str
def _gen_trainer_desc(self):
pass
...