Commit 5dcfb699 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

......@@ -90,9 +90,9 @@ endif()
if (WITH_ASCEND_CL)
macro(find_ascend_toolkit_version ascend_toolkit_version_info)
file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
if(NOT ASCEND_TOOLKIT_VERSION)
set(ASCEND_TOOLKIT_VERSION "???")
......
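The updated pattern accepts release-candidate versions such as "5.0.RC1.alpha001" (a hypothetical sample) and derives a numeric `CANN_VERSION` by stripping letters and dots. A minimal C++ sketch of the same extraction, using `std::regex` in place of the CMake string commands:

```cpp
// Standalone sketch of the version parsing above; the sample version string
// is an assumption for illustration.
#include <iostream>
#include <regex>
#include <string>

int main() {
  std::string contents = "version=5.0.RC1.alpha001";
  std::smatch m;
  // Mirrors the CMake pattern: an optional "RC" before the third numeric group.
  std::regex pattern(R"(version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*))");
  if (std::regex_search(contents, m, pattern)) {
    std::string toolkit_version = m[1];  // "5.0.RC1.alpha001"
    // Strip uppercase letters, lowercase letters, and dots, as the CMake
    // REGEX REPLACE "[A-Z]|[a-z|\.]" does, leaving only digits.
    std::string cann_version =
        std::regex_replace(toolkit_version, std::regex("[A-Za-z.]"), "");
    std::cout << toolkit_version << " -> CANN_VERSION_CODE=" << cann_version
              << "\n";  // prints 5.0.RC1.alpha001 -> CANN_VERSION_CODE=501001
  }
}
```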
......@@ -238,7 +238,7 @@ int32_t BrpcPsClient::initialize() {
std::thread(std::bind(&BrpcPsClient::push_dense_task_consume, this));
// for debug
// _print_thread =
// std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));
// std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));
return 0;
}
......@@ -1315,11 +1315,11 @@ std::future<int32_t> BrpcPsClient::push_sparse(size_t table_id,
CostTimer parse_timer("pserver_client_push_sparse_parse");
int push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
while (push_sparse_async_num > FLAGS_pserver_max_async_call_num) {
// LOG(INFO) << "push_sparse Waiting for async_call_num comsume, task_num:"
// << push_sparse_async_num << ", max_task_limit:" <<
// FLAGS_pserver_max_async_call_num;
// LOG(INFO) << "push_sparse Waiting for async_call_num comsume,
// task_num:"
// << push_sparse_async_num
// << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
usleep(5000); // 5ms
// push_sparse_async_num = _push_sparse_task_queue_map[table_id]->size();
push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
}
auto put_timer = std::make_shared<CostTimer>("client_push_sparse_put");
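Both push paths guard the per-table async queue the same way: while the number of in-flight tasks exceeds `FLAGS_pserver_max_async_call_num`, the producer sleeps 5 ms and rechecks. A minimal standalone sketch of this backpressure pattern, with an atomic counter and a made-up limit standing in for the real task queue and gflag:

```cpp
// Sketch of producer-side backpressure; the counter, limit, and timings are
// stand-ins, not the real Paddle types.
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

std::atomic<int> g_pending_tasks{0};
constexpr int kMaxAsyncCallNum = 13;  // stands in for FLAGS_pserver_max_async_call_num

void throttled_push() {
  // Producer blocks while the async queue is over the limit, as in
  // BrpcPsClient::push_sparse / push_dense above.
  while (g_pending_tasks.load() > kMaxAsyncCallNum) {
    std::this_thread::sleep_for(std::chrono::milliseconds(5));  // usleep(5000)
  }
  g_pending_tasks.fetch_add(1);  // enqueue one push task
}

int main() {
  std::atomic<bool> done{false};
  std::thread consumer([&] {
    while (!done.load() || g_pending_tasks.load() > 0) {
      if (g_pending_tasks.load() > 0) g_pending_tasks.fetch_sub(1);
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
  });
  for (int i = 0; i < 50; ++i) throttled_push();
  done.store(true);
  consumer.join();
  std::cout << "pending after drain: " << g_pending_tasks.load() << "\n";  // 0
}
```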
......@@ -1381,8 +1381,7 @@ void BrpcPsClient::push_sparse_task_consume() {
::ThreadPool async_push_sparse_shard_threads(
FLAGS_pserver_sparse_merge_thread);
while (_running) {
platform::Timer timeline;
timeline.Start();
auto async_start_time_ms = butil::gettimeofday_ms();
// process the pushTasks of all sparse tables
for (auto &push_sparse_task_itr : _push_sparse_task_queue_map) {
auto table_id = push_sparse_task_itr.first;
......@@ -1497,9 +1496,8 @@ void BrpcPsClient::push_sparse_task_consume() {
std::vector<std::future<int>>().swap(merge_status);
}
}
timeline.Pause();
auto wait_ms =
FLAGS_pserver_async_push_sparse_interval_ms - (timeline.ElapsedMS());
auto wait_ms = FLAGS_pserver_async_push_sparse_interval_ms -
(butil::gettimeofday_ms() - async_start_time_ms);
if (wait_ms > 0) {
usleep(wait_ms * 1000);
}
......@@ -1661,9 +1659,10 @@ std::future<int32_t> BrpcPsClient::push_dense(const Region *regions,
std::make_shared<CostTimer>("pserver_client_push_dense_parse");
int push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
while (push_dense_async_num > FLAGS_pserver_max_async_call_num) {
LOG(INFO) << "push_dense Waiting for async_call_num comsume, task_num:"
<< push_dense_async_num
<< ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
// LOG(INFO) << "push_dense Waiting for async_call_num comsume,
// task_num:"
// << push_dense_async_num
// << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
usleep(5000); // 5ms
push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
}
......@@ -1701,8 +1700,7 @@ void BrpcPsClient::push_dense_task_consume() {
static bool scale_gradient = FLAGS_pserver_scale_gradient_by_merge;
::ThreadPool async_merge_dense_threads(10);
while (_running) {
platform::Timer timeline;
timeline.Start();
auto async_start_time_ms = butil::gettimeofday_ms();
for (auto &task_queue_itr : _push_dense_task_queue_map) {
auto &task_queue = task_queue_itr.second;
auto queue_size = task_queue->Size();
......@@ -1791,9 +1789,8 @@ void BrpcPsClient::push_dense_task_consume() {
push_dense_raw_gradient(task_ptr, total_send_data, total_send_data_size,
closure);
}
timeline.Pause();
auto wait_ms =
FLAGS_pserver_async_push_dense_interval_ms - (timeline.ElapsedMS());
auto wait_ms = FLAGS_pserver_async_push_dense_interval_ms -
(butil::gettimeofday_ms() - async_start_time_ms);
if (wait_ms > 0) {
usleep(wait_ms * 1000);
}
......
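The consume threads now time each pass with `butil::gettimeofday_ms()` instead of `platform::Timer`, then sleep only for the remainder of the configured interval. A sketch of the loop shape, with `std::chrono` standing in for the butil clock and illustrative constants:

```cpp
// Fixed-interval consume loop, as in push_sparse_task_consume /
// push_dense_task_consume above; interval and pass count are illustrative.
#include <chrono>
#include <iostream>
#include <thread>

static int64_t now_ms() {
  using namespace std::chrono;
  return duration_cast<milliseconds>(steady_clock::now().time_since_epoch())
      .count();
}

int main() {
  const int64_t interval_ms = 10;  // FLAGS_pserver_async_push_dense_interval_ms
  for (int pass = 0; pass < 5; ++pass) {
    int64_t start_ms = now_ms();  // async_start_time_ms
    // ... drain and send the per-table task queues here ...
    int64_t wait_ms = interval_ms - (now_ms() - start_ms);
    if (wait_ms > 0) {
      // Sleep only for the remainder of the interval, so a slow pass does not
      // add a full interval of delay on top of the work it already did.
      std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
    }
    std::cout << "pass " << pass << " done\n";
  }
}
```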
......@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include <google/protobuf/text_format.h>
#include "gflags/gflags.h"
#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -66,34 +65,9 @@ std::shared_ptr<Communicator> Communicator::communicator_(nullptr);
void Communicator::InitBrpcClient(
const std::string &dist_desc,
const std::vector<std::string> &host_sign_list) {
// not used, just for psclient's init
std::map<uint64_t, std::vector<paddle::distributed::Region>>
_dense_pull_regions;
for (auto &iter : recv_varname_to_ctx_) {
auto tid = iter.first;
auto var_names = iter.second;
auto &regions = _dense_pull_regions[tid];
regions.reserve(var_names.size());
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
float *w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
regions.emplace_back(std::move(reg));
}
}
auto fleet = paddle::distributed::FleetWrapper::GetInstance();
if (_worker_ptr.get() == nullptr) {
google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param);
init_gflag(_ps_param.init_gflags());
servers_ = host_sign_list.size();
_ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(&host_sign_list, servers_);
_worker_ptr = std::unique_ptr<paddle::distributed::PSClient>(
paddle::distributed::PSClientFactory::create(_ps_param));
_worker_ptr->configure(_ps_param, _dense_pull_regions, _ps_env,
trainer_id_);
_worker_ptr = fleet->worker_ptr_;
}
return;
}
......@@ -146,11 +120,11 @@ void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,
for (auto &t : varnames) {
Variable *var = scope->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());
float *temp_recv_data = tensor->mutable_data<float>(platform::CPUPlace());
VLOG(1) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
VLOG(3) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
<< table_id << " Temp_data[0] " << temp_recv_data[0]
<< " Temp_data[-1] " << temp_recv_data[tensor->numel() - 1];
if (platform::is_gpu_place(tensor->place())) {
......@@ -481,7 +455,7 @@ void AsyncCommunicator::RecvNoBarrier() {
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
......@@ -653,7 +627,7 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
if (batch_size == -1) {
batch_size = cur_batch_size;
} else {
} else if (batch_size != cur_batch_size) {
// CHECK(batch_size == cur_batch_size); // NOLINT
batch_size_consist = false;
break;
......@@ -676,7 +650,8 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
size_t output_len = 0;
size_t input_idx = 0;
VLOG(2) << "fleet.cc::emb_dim: " << fea_dim;
VLOG(2) << "fleet.cc::emb_dim: " << fea_dim << " batch_size: " << batch_size
<< " batch_size_consist: " << batch_size_consist;
// TODO(zhaocaibei123): check type of show/clk is int? float? uint64?
// const long int* show_tensor = shows->data<int64_t>();
......@@ -687,13 +662,14 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
for (size_t index = 0; index < inputs->size(); ++index) {
framework::LoDTensor *g_tensor = outputs->at(index);
float *g = g_tensor->data<float>();
// no cvm
if (batch_size_consist) { // TODO(zhaocaibei123): add config
// scale_sparse_gradient_with_batch_size_
Eigen::Map<
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
g_mat.rightCols(fea_dim) *= batch_size;
g_mat.rightCols(fea_dim - 2) *=
batch_size; // hard code here, because of cvm_grad op
}
const framework::LoDTensor *tensor = inputs->at(index);
......@@ -710,16 +686,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
continue;
}
push_keys.emplace_back(real_id);
push_values.emplace_back(fea_dim + 3);
push_values.emplace_back(fea_dim + 1);
// slot show clk grad... consistent with CtrCommonPushValue defined in
// ctr_accessor.h
push_values.back()[0] = 2; // TODO(zhaocaibei123): slot
push_values.back()[1] =
(i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
push_values.back()[2] =
(i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
// push_values.back()[1] =
// (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
// push_values.back()[2] =
// (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
float *data = push_values.back().data() + 3;
float *data = push_values.back().data() + 1; // hard code here
memcpy(data, g + output_len, sizeof(float) * fea_dim);
......@@ -733,16 +709,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
continue;
}
push_keys.emplace_back(real_id);
push_values.emplace_back(fea_dim + 3);
push_values.emplace_back(fea_dim + 1);
// slot show clk grad... consistent with CtrCommonPushValue defined in
// ctr_accessor.h
push_values.back()[0] = 2; // TODO(zhaocaibei123): slot
push_values.back()[1] =
(i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
push_values.back()[2] =
(i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
// push_values.back()[1] =
// (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
// push_values.back()[2] =
// (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
float *data = push_values.back().data() + 3;
float *data = push_values.back().data() + 1;
memcpy(data, g + output_len, sizeof(float) * fea_dim);
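With the CVM path, each sparse push value shrinks from `fea_dim + 3` floats (`slot, show, click, grad...`) to `fea_dim + 1` (`slot, grad...`): show and click now arrive inside the gradient emitted by the cvm_grad op, which is also why only the last `fea_dim - 2` gradient columns are scaled by the batch size above. A small illustration of the new layout, with made-up dimensions and values:

```cpp
// Illustration of the fea_dim + 1 push-value layout; numbers are made up.
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const int fea_dim = 4;  // gradient width, including the two cvm columns
  std::vector<float> g = {0.1f, 0.2f, 0.3f, 0.4f};  // gradient for one key

  std::vector<float> push_value(fea_dim + 1);  // was fea_dim + 3 before
  push_value[0] = 2;  // slot, hard-coded as in the code above
  std::memcpy(push_value.data() + 1, g.data(), sizeof(float) * fea_dim);

  for (float v : push_value) std::cout << v << ' ';
  std::cout << "\n";  // 2 0.1 0.2 0.3 0.4
}
```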
......@@ -837,7 +813,7 @@ void AsyncCommunicator::Stop() {
if (!communicator_) {
VLOG(0) << "Communicator is not inited, do nothing";
} else {
_worker_ptr->finalize_worker();
// _worker_ptr->finalize_worker();
VLOG(1) << "client finalize_worker done";
if (recv_thread_) {
VLOG(1) << "stop recv thread";
......
......@@ -360,13 +360,13 @@ class Communicator {
PSClient *GetPsClient() { return _worker_ptr.get(); }
std::unique_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
std::shared_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
return std::move(_worker_ptr);
}
RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; }
std::unique_ptr<PSClient> _worker_ptr; // pointer to worker
std::shared_ptr<PSClient> _worker_ptr; // pointer to worker
protected:
bool running_ = false;
......
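Switching `_worker_ptr` from `std::unique_ptr` to `std::shared_ptr` lets the Communicator alias the `PSClient` that `FleetWrapper` creates and owns (`worker_ptr_`), as in the `InitBrpcClient` change above, instead of constructing its own. A minimal sketch of that shared ownership, using stand-in types rather than the real Paddle classes:

```cpp
// With shared_ptr, FleetWrapper and Communicator can point at the same
// PSClient instance; these structs are illustrative stand-ins.
#include <iostream>
#include <memory>

struct PSClient { int id = 42; };

struct FleetWrapper {
  std::shared_ptr<PSClient> worker_ptr_ = std::make_shared<PSClient>();
};

struct Communicator {
  std::shared_ptr<PSClient> _worker_ptr;  // was std::unique_ptr before
};

int main() {
  FleetWrapper fleet;
  Communicator comm;
  comm._worker_ptr = fleet.worker_ptr_;  // alias; no transfer of ownership
  std::cout << comm._worker_ptr->id << " use_count="
            << comm._worker_ptr.use_count() << "\n";  // 42 use_count=2
}
```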
......@@ -43,11 +43,12 @@ set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPI
set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(downpour_ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
cc_library(ctr_double_accessor SRCS ctr_double_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(ctr_accessor SRCS ctr_accessor.cc sparse_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(downpour_ctr_accessor SRCS downpour_ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)
......
......@@ -115,6 +115,8 @@ int32_t CommonDenseTable::initialize_optimizer() {
// optimizer_->set_global_lr(_global_lr); //no use
} else if (name == "sum") {
optimizer_ = std::make_shared<DSUM>(common, &values_);
} else if (name == "summary") {
optimizer_ = std::make_shared<DSummary>(common, &values_);
} else {
VLOG(0) << "init optimizer failed";
}
......@@ -339,19 +341,27 @@ int32_t CommonDenseTable::save(const std::string& path,
auto common = _config.common();
int size = static_cast<int>(common.params().size());
std::ostringstream os;
for (int x = 0; x < size; ++x) {
auto& varname = common.params()[x];
auto& dim = common.dims()[x];
VLOG(0) << "CommonDenseTable::save dim " << x << " size: " << dim;
for (int y = 0; y < dim; ++y) {
os.clear();
os.str("");
os << values_[x][y];
if (dim == param_dim_) {
result_buffer_param[y].emplace_back(std::move(os.str()));
} else {
result_buffer_fixed_len.emplace_back(std::move(os.str()));
if (_config.common().name() == "summary") {
for (int x = 0; x < param_dim_; ++x) {
result_buffer_param[x].emplace_back(
std::to_string(values_[param_idx_][x]));
}
} else {
std::ostringstream os;
for (int x = 0; x < size; ++x) {
auto& varname = common.params()[x];
auto& dim = common.dims()[x];
VLOG(3) << "CommonDenseTable::save dim " << x << " size: " << dim;
for (int y = 0; y < dim; ++y) {
os.clear();
os.str("");
os << values_[x][y];
if (dim == param_dim_) {
result_buffer_param[y].emplace_back(std::move(os.str()));
} else {
result_buffer_fixed_len.emplace_back(std::move(os.str()));
}
}
}
}
......
......@@ -65,7 +65,7 @@ size_t CtrCommonAccessor::mf_size() {
// pull value
size_t CtrCommonAccessor::select_dim() {
auto embedx_dim = _config.embedx_dim();
return 1 + embedx_dim;
return 3 + embedx_dim;
}
size_t CtrCommonAccessor::select_dim_size(size_t dim) { return sizeof(float); }
......@@ -213,6 +213,10 @@ int32_t CtrCommonAccessor::select(float** select_values, const float** values,
for (size_t value_item = 0; value_item < num; ++value_item) {
float* select_value = select_values[value_item];
const float* value = values[value_item];
select_value[CtrCommonPullValue::show_index()] =
value[common_feature_value.show_index()];
select_value[CtrCommonPullValue::click_index()] =
value[common_feature_value.click_index()];
select_value[CtrCommonPullValue::embed_w_index()] =
value[common_feature_value.embed_w_index()];
memcpy(select_value + CtrCommonPullValue::embedx_w_index(),
......
......@@ -24,6 +24,7 @@
namespace paddle {
namespace distributed {
// DownpourUnitAccessor
class CtrCommonAccessor : public ValueAccessor {
public:
struct CtrCommonFeatureValue {
......@@ -106,15 +107,25 @@ class CtrCommonAccessor : public ValueAccessor {
struct CtrCommonPullValue {
/*
float show;
float click;
float embed_w;
std::vector<float> embedx_w;
*/
static int dim(int embedx_dim) { return 1 + embedx_dim; }
static int dim(int embedx_dim) { return 3 + embedx_dim; }
static int dim_size(size_t dim) { return sizeof(float); }
static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
static int embed_w_index() { return 0; }
static int embedx_w_index() { return 1; }
static int show_index() { return 0; }
static int click_index() { return 1; }
static int embed_w_index() { return 2; }
static int embedx_w_index() { return 3; }
static float& show(float* val) {
return val[CtrCommonPullValue::show_index()];
}
static float& click(float* val) {
return val[CtrCommonPullValue::click_index()];
}
static float& embed_w(float* val) {
return val[CtrCommonPullValue::embed_w_index()];
}
......
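The pull value grows from `[embed_w, embedx_w...]` (`1 + embedx_dim` floats) to `[show, click, embed_w, embedx_w...]` (`3 + embedx_dim`), and `select()` now copies show and click out of the stored feature value. A worked example of the new index layout, with an illustrative `embedx_dim`:

```cpp
// New CtrCommonPullValue layout, evaluated for embedx_dim = 2 (illustrative).
#include <cstdio>

int main() {
  const int embedx_dim = 2;
  const int show_index = 0;
  const int click_index = 1;
  const int embed_w_index = 2;
  const int embedx_w_index = 3;

  float pull[3 + embedx_dim] = {0};
  pull[show_index] = 10.f;          // copied from common_feature_value.show
  pull[click_index] = 3.f;          // copied from common_feature_value.click
  pull[embed_w_index] = 0.5f;       // embedding weight
  pull[embedx_w_index + 0] = 0.1f;  // embedx weights
  pull[embedx_w_index + 1] = 0.2f;
  std::printf("select_dim = %d\n", 3 + embedx_dim);  // was 1 + embedx_dim
}
```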
......@@ -196,26 +196,19 @@ class DAdamD2Sum : public DenseOptimizer {
for (int x = 0; x < static_cast<int>(names.size()); ++x) {
if (names[x] == "LearningRate") {
learning_rate = (*values)[x].data();
}
if (names[x] == "Param") {
} else if (names[x] == "Param") {
param = (*values)[x].data();
}
if (names[x] == "Moment") {
} else if (names[x] == "Moment") {
mom_velocity = (*values)[x].data();
}
if (names[x] == "G2Sum") {
} else if (names[x] == "G2Sum") {
ada_g2sum = (*values)[x].data();
}
if (names[x] == "D2Sum") {
} else if (names[x] == "D2Sum") {
ada_d2sum = (*values)[x].data();
}
if (names[x] == "MomentDecayRate") {
} else if (names[x] == "MomentDecayRate") {
mom_decay_rate = (*values)[x].data();
}
if (names[x] == "AdaDecayRate") {
} else if (names[x] == "AdaDecayRate") {
ada_decay_rate = (*values)[x].data();
}
if (names[x] == "AdaEpsilon") {
} else if (names[x] == "AdaEpsilon") {
ada_epsilon = (*values)[x].data();
}
}
......@@ -268,5 +261,34 @@ class DAdamD2Sum : public DenseOptimizer {
float* ada_epsilon;
};
// for data_norm
class DSummary : public DenseOptimizer {
public:
explicit DSummary(const CommonAccessorParameter& accessor,
std::vector<std::vector<float>>* values) {
auto& names = accessor.params();
for (int x = 0; x < static_cast<int>(names.size()); ++x) {
if (names[x] == "Param") {
param = (*values)[x].data();
} else if (names[x] == "SummaryDecayRate") {
summary_decay_rate = (*values)[x].data();
}
}
}
void update(const float* update_values, size_t num, int begin,
int end) override {
auto update_numel = end - begin;
Eigen::Map<Eigen::MatrixXf> mat_w(param + begin, 1, update_numel);
Eigen::Map<const Eigen::MatrixXf> mat_grad(update_values + begin, 1,
update_numel);
mat_w = mat_w * summary_decay_rate_d + mat_grad;
}
float* summary_decay_rate;
double summary_decay_rate_d = 0.999999;
float* param;
};
} // namespace distributed
} // namespace paddle
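`DSummary` implements the data_norm update `param = param * decay + grad` with the decay rate hard-coded to 0.999999. A standalone sketch of the same elementwise update, with plain loops standing in for the `Eigen::Map` matrices and illustrative values:

```cpp
// Sketch of the DSummary update over [begin, end); values are illustrative.
#include <cstdio>

void summary_update(float* param, const float* grad, int begin, int end,
                    double decay = 0.999999) {
  for (int i = begin; i < end; ++i) {
    // w = w * summary_decay_rate_d + grad, as in DSummary::update above
    param[i] = static_cast<float>(param[i] * decay) + grad[i];
  }
}

int main() {
  float param[4] = {100.f, 200.f, 300.f, 400.f};  // e.g. data_norm summaries
  float grad[4] = {1.f, 1.f, 1.f, 1.f};
  summary_update(param, grad, 0, 4);
  for (float v : param) std::printf("%.4f ", v);
  std::printf("\n");  // ~100.9999 ~200.9998 ~300.9997 ~400.9996
}
```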
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/ps/table/sparse_accessor.h"
#include <gflags/gflags.h>
#include "glog/logging.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
int SparseAccessor::initialize() {
auto name = _config.embed_sgd_param().name();
_embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
_embed_sgd_rule->load_config(_config.embed_sgd_param(), 1);
name = _config.embedx_sgd_param().name();
_embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
_embedx_sgd_rule->load_config(_config.embedx_sgd_param(),
_config.embedx_dim());
sparse_feature_value.embed_sgd_dim = _embed_sgd_rule->dim();
sparse_feature_value.embedx_dim = _config.embedx_dim();
sparse_feature_value.embedx_sgd_dim = _embedx_sgd_rule->dim();
_show_click_decay_rate = _config.ctr_accessor_param().show_click_decay_rate();
return 0;
}
void SparseAccessor::GetTableInfo(AccessorInfo& info) {
info.dim = dim();
info.size = size();
info.select_dim = select_dim();
info.select_size = select_size();
info.update_dim = update_dim();
info.update_size = update_size();
info.fea_dim = fea_dim();
}
size_t SparseAccessor::dim() { return sparse_feature_value.dim(); }
size_t SparseAccessor::dim_size(size_t dim) {
auto embedx_dim = _config.embedx_dim();
return sparse_feature_value.dim_size(dim, embedx_dim);
}
size_t SparseAccessor::size() { return sparse_feature_value.size(); }
size_t SparseAccessor::mf_size() {
return (_config.embedx_dim() + sparse_feature_value.embedx_sgd_dim) *
sizeof(float); // embedx embedx_g2sum
}
// pull value
size_t SparseAccessor::select_dim() {
auto embedx_dim = _config.embedx_dim();
return 1 + embedx_dim;
}
size_t SparseAccessor::select_dim_size(size_t dim) { return sizeof(float); }
size_t SparseAccessor::select_size() { return select_dim() * sizeof(float); }
// push value
size_t SparseAccessor::update_dim() {
auto embedx_dim = _config.embedx_dim();
return 4 + embedx_dim;
}
size_t SparseAccessor::update_dim_size(size_t dim) { return sizeof(float); }
size_t SparseAccessor::update_size() { return update_dim() * sizeof(float); }
bool SparseAccessor::shrink(float* value) {
auto base_threshold = _config.ctr_accessor_param().base_threshold();
auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
auto delete_after_unseen_days =
_config.ctr_accessor_param().delete_after_unseen_days();
auto delete_threshold = _config.ctr_accessor_param().delete_threshold();
// time_decay first
sparse_feature_value.show(value) *= _show_click_decay_rate;
sparse_feature_value.click(value) *= _show_click_decay_rate;
// shrink after
auto score = show_click_score(sparse_feature_value.show(value),
sparse_feature_value.click(value));
auto unseen_days = sparse_feature_value.unseen_days(value);
if (score < delete_threshold || unseen_days > delete_after_unseen_days) {
return true;
}
return false;
}
bool SparseAccessor::save(float* value, int param) {
auto base_threshold = _config.ctr_accessor_param().base_threshold();
auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days();
if (param == 2) {
delta_threshold = 0;
}
switch (param) {
// save all
case 0: {
return true;
}
// save xbox delta
case 1:
// save xbox base
case 2: {
if (show_click_score(sparse_feature_value.show(value),
sparse_feature_value.click(value)) >=
base_threshold &&
sparse_feature_value.delta_score(value) >= delta_threshold &&
sparse_feature_value.unseen_days(value) <= delta_keep_days) {
// do this after save, because it must not be modified on retry
if (param == 2) {
sparse_feature_value.delta_score(value) = 0;
}
return true;
} else {
return false;
}
}
// already decayed in shrink
case 3: {
// do this after save, because it must not be modified on retry
// sparse_feature_value.unseen_days(value)++;
return true;
}
// save revert batch_model
case 5: {
return true;
}
default:
return true;
}
}
void SparseAccessor::update_stat_after_save(float* value, int param) {
auto base_threshold = _config.ctr_accessor_param().base_threshold();
auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days();
if (param == 2) {
delta_threshold = 0;
}
switch (param) {
case 1: {
if (show_click_score(sparse_feature_value.show(value),
sparse_feature_value.click(value)) >=
base_threshold &&
sparse_feature_value.delta_score(value) >= delta_threshold &&
sparse_feature_value.unseen_days(value) <= delta_keep_days) {
sparse_feature_value.delta_score(value) = 0;
}
}
return;
case 3: {
sparse_feature_value.unseen_days(value)++;
}
return;
default:
return;
}
}
int32_t SparseAccessor::create(float** values, size_t num) {
auto embedx_dim = _config.embedx_dim();
for (size_t value_item = 0; value_item < num; ++value_item) {
float* value = values[value_item];
value[sparse_feature_value.unseen_days_index()] = 0;
value[sparse_feature_value.delta_score_index()] = 0;
value[sparse_feature_value.show_index()] = 0;
value[sparse_feature_value.click_index()] = 0;
value[sparse_feature_value.slot_index()] = -1;
_embed_sgd_rule->init_value(
value + sparse_feature_value.embed_w_index(),
value + sparse_feature_value.embed_g2sum_index());
_embedx_sgd_rule->init_value(
value + sparse_feature_value.embedx_w_index(),
value + sparse_feature_value.embedx_g2sum_index(), false);
}
return 0;
}
bool SparseAccessor::need_extend_mf(float* value) {
float show = value[sparse_feature_value.show_index()];
float click = value[sparse_feature_value.click_index()];
float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() +
click * _config.ctr_accessor_param().click_coeff();
return score >= _config.embedx_threshold();
}
bool SparseAccessor::has_mf(size_t size) {
return size > sparse_feature_value.embedx_g2sum_index();
}
// from SparseFeatureValue to SparsePullValue
int32_t SparseAccessor::select(float** select_values, const float** values,
size_t num) {
auto embedx_dim = _config.embedx_dim();
for (size_t value_item = 0; value_item < num; ++value_item) {
float* select_value = select_values[value_item];
const float* value = values[value_item];
select_value[SparsePullValue::embed_w_index()] =
value[sparse_feature_value.embed_w_index()];
memcpy(select_value + SparsePullValue::embedx_w_index(),
value + sparse_feature_value.embedx_w_index(),
embedx_dim * sizeof(float));
}
return 0;
}
// from SparsePushValue to SparsePushValue
// first dim: item
// second dim: field num
int32_t SparseAccessor::merge(float** update_values,
const float** other_update_values, size_t num) {
auto embedx_dim = _config.embedx_dim();
size_t total_dim = SparsePushValue::dim(embedx_dim);
for (size_t value_item = 0; value_item < num; ++value_item) {
float* update_value = update_values[value_item];
const float* other_update_value = other_update_values[value_item];
for (auto i = 0u; i < total_dim; ++i) {
if (i != SparsePushValue::slot_index()) {
update_value[i] += other_update_value[i];
}
}
}
return 0;
}
// from SparsePushValue to SparseFeatureValue
// first dim: item
// second dim: field num
int32_t SparseAccessor::update(float** update_values, const float** push_values,
size_t num) {
auto embedx_dim = _config.embedx_dim();
for (size_t value_item = 0; value_item < num; ++value_item) {
float* update_value = update_values[value_item];
const float* push_value = push_values[value_item];
float push_show = push_value[SparsePushValue::show_index()];
float push_click = push_value[SparsePushValue::click_index()];
float slot = push_value[SparsePushValue::slot_index()];
update_value[sparse_feature_value.show_index()] += push_show;
update_value[sparse_feature_value.click_index()] += push_click;
update_value[sparse_feature_value.slot_index()] = slot;
update_value[sparse_feature_value.delta_score_index()] +=
(push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() +
push_click * _config.ctr_accessor_param().click_coeff();
update_value[sparse_feature_value.unseen_days_index()] = 0;
_embed_sgd_rule->update_value(
update_value + sparse_feature_value.embed_w_index(),
update_value + sparse_feature_value.embed_g2sum_index(),
push_value + SparsePushValue::embed_g_index());
_embedx_sgd_rule->update_value(
update_value + sparse_feature_value.embedx_w_index(),
update_value + sparse_feature_value.embedx_g2sum_index(),
push_value + SparsePushValue::embedx_g_index());
}
return 0;
}
bool SparseAccessor::create_value(int stage, const float* value) {
// stage == 0, pull
// stage == 1, push
if (stage == 0) {
return true;
} else if (stage == 1) {
// operation
auto show = SparsePushValue::show(const_cast<float*>(value));
auto click = SparsePushValue::click(const_cast<float*>(value));
auto score = show_click_score(show, click);
if (score <= 0) {
return false;
}
if (score >= 1) {
return true;
}
return local_uniform_real_distribution<float>()(local_random_engine()) <
score;
} else {
return true;
}
}
float SparseAccessor::show_click_score(float show, float click) {
auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff();
auto click_coeff = _config.ctr_accessor_param().click_coeff();
return (show - click) * nonclk_coeff + click * click_coeff;
}
std::string SparseAccessor::parse_to_string(const float* v, int param) {
thread_local std::ostringstream os;
os.clear();
os.str("");
os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " "
<< v[5];
for (int i = sparse_feature_value.embed_g2sum_index();
i < sparse_feature_value.embedx_w_index(); i++) {
os << " " << v[i];
}
auto show = sparse_feature_value.show(const_cast<float*>(v));
auto click = sparse_feature_value.click(const_cast<float*>(v));
auto score = show_click_score(show, click);
if (score >= _config.embedx_threshold() &&
param > sparse_feature_value.embedx_w_index()) {
for (auto i = sparse_feature_value.embedx_w_index();
i < sparse_feature_value.dim(); ++i) {
os << " " << v[i];
}
}
return os.str();
}
int SparseAccessor::parse_from_string(const std::string& str, float* value) {
int embedx_dim = _config.embedx_dim();
_embedx_sgd_rule->init_value(
value + sparse_feature_value.embedx_w_index(),
value + sparse_feature_value.embedx_g2sum_index());
auto ret = paddle::string::str_to_float(str.data(), value);
CHECK(ret >= 6) << "expect more than 6 real:" << ret;
return ret;
}
} // namespace distributed
} // namespace paddle
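Several decisions in this accessor (shrink, the save thresholds, sampling in `create_value`) key off `show_click_score(show, click) = (show - click) * nonclk_coeff + click * click_coeff`. A worked example, with assumed coefficient values of the kind typical CTR configs use:

```cpp
// Worked example of show_click_score; the coefficients are assumptions.
#include <cstdio>

float show_click_score(float show, float click, float nonclk_coeff,
                       float click_coeff) {
  return (show - click) * nonclk_coeff + click * click_coeff;
}

int main() {
  const float nonclk_coeff = 0.1f;  // weight of a non-clicked impression
  const float click_coeff = 1.0f;   // weight of a click
  // 10 impressions, 2 clicks: 8 non-clicks * 0.1 + 2 clicks * 1.0 = 2.8
  std::printf("%.2f\n",
              show_click_score(10.f, 2.f, nonclk_coeff, click_coeff));
}
```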
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "paddle/fluid/distributed/common/registerer.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"
namespace paddle {
namespace distributed {
// no show click, for word2vec(DownpourSparseValueAccessor)
class SparseAccessor : public ValueAccessor {
public:
struct SparseFeatureValue {
/*
float slot;
float unseen_days;
float delta_score;
float show;
float click;
float embed_w;
std::vector<float> embed_g2sum;
std::vector<float> embedx_w;
std::vector<float> embedx_g2sum;
*/
int dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; }
int dim_size(size_t dim, int embedx_dim) { return sizeof(float); }
int size() { return dim() * sizeof(float); }
int slot_index() { return 0; }
int unseen_days_index() { return slot_index() + 1; }
int delta_score_index() { return unseen_days_index() + 1; }
int show_index() { return delta_score_index() + 1; }
int click_index() { return show_index() + 1; }
int embed_w_index() { return click_index() + 1; }
int embed_g2sum_index() { return embed_w_index() + 1; }
int embedx_w_index() { return embed_g2sum_index() + embed_sgd_dim; }
int embedx_g2sum_index() { return embedx_w_index() + embedx_dim; }
float& unseen_days(float* val) { return val[unseen_days_index()]; }
float& delta_score(float* val) { return val[delta_score_index()]; }
float& show(float* val) { return val[show_index()]; }
float& click(float* val) { return val[click_index()]; }
float& slot(float* val) { return val[slot_index()]; }
float& embed_w(float* val) { return val[embed_w_index()]; }
float& embed_g2sum(float* val) { return val[embed_g2sum_index()]; }
float& embedx_w(float* val) { return val[embedx_w_index()]; }
float& embedx_g2sum(float* val) { return val[embedx_g2sum_index()]; }
int embed_sgd_dim;
int embedx_dim;
int embedx_sgd_dim;
};
struct SparsePushValue {
/*
float slot;
float show;
float click;
float embed_g;
std::vector<float> embedx_g;
*/
static int dim(int embedx_dim) { return 4 + embedx_dim; }
static int dim_size(int dim, int embedx_dim) { return sizeof(float); }
static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
static int slot_index() { return 0; }
static int show_index() { return SparsePushValue::slot_index() + 1; }
static int click_index() { return SparsePushValue::show_index() + 1; }
static int embed_g_index() { return SparsePushValue::click_index() + 1; }
static int embedx_g_index() { return SparsePushValue::embed_g_index() + 1; }
static float& slot(float* val) {
return val[SparsePushValue::slot_index()];
}
static float& show(float* val) {
return val[SparsePushValue::show_index()];
}
static float& click(float* val) {
return val[SparsePushValue::click_index()];
}
static float& embed_g(float* val) {
return val[SparsePushValue::embed_g_index()];
}
static float* embedx_g(float* val) {
return val + SparsePushValue::embedx_g_index();
}
};
struct SparsePullValue {
/*
float embed_w;
std::vector<float> embedx_w;
*/
static int dim(int embedx_dim) { return 1 + embedx_dim; }
static int dim_size(size_t dim) { return sizeof(float); }
static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
static int embed_w_index() { return 0; }
static int embedx_w_index() { return 1; }
static float& embed_w(float* val) {
return val[SparsePullValue::embed_w_index()];
}
static float* embedx_w(float* val) {
return val + SparsePullValue::embedx_w_index();
}
};
SparseAccessor() {}
virtual int initialize();
virtual void GetTableInfo(AccessorInfo& info);
virtual ~SparseAccessor() {}
// dimension of value
virtual size_t dim();
// size of each dimension of value
virtual size_t dim_size(size_t dim);
// total size of value across all dimensions
virtual size_t size();
// total size of the variable-length mf part of value; takes effect for sparse
virtual size_t mf_size();
// dimension of pull value
virtual size_t select_dim();
// size of each dimension of pull value
virtual size_t select_dim_size(size_t dim);
// total size of pull value across all dimensions
virtual size_t select_size();
// dimension of push value
virtual size_t update_dim();
// size of each dimension of push value
virtual size_t update_dim_size(size_t dim);
// total size of push value across all dimensions
virtual size_t update_size();
// decide whether this value should be shrunk
virtual bool shrink(float* value);
// decide whether this value should be saved to ssd
// virtual bool save_ssd(float* value);
virtual bool need_extend_mf(float* value);
virtual bool has_mf(size_t size);
// decide whether this value is dumped during the save stage;
// param identifies the save stage, e.g. downpour's xbox vs. batch_model
// param = 0, save all feature
// param = 1, save delta feature
// param = 2, save xbox base feature
bool save(float* value, int param) override;
// update delta_score and unseen_days after save
void update_stat_after_save(float* value, int param) override;
// generate random values for keys that do not exist;
// the caller must have already allocated the memory for value
virtual int32_t create(float** value, size_t num);
// select from values into select_values
virtual int32_t select(float** select_values, const float** values,
size_t num);
// aggregate update_values together
virtual int32_t merge(float** update_values,
const float** other_update_values, size_t num);
// aggregate update_values together; it.next decides whether to move on to the next key
// virtual int32_t merge(float** update_values, iterator it);
// apply update_values onto values
virtual int32_t update(float** values, const float** update_values,
size_t num);
std::string parse_to_string(const float* value, int param) override;
int32_t parse_from_string(const std::string& str, float* v) override;
virtual bool create_value(int type, const float* value);
// this interface is currently only used to fetch show
float get_field(float* value, const std::string& name) override {
// CHECK(name == "show");
if (name == "show") {
return sparse_feature_value.show(value);
}
return 0.0;
}
private:
// float show_click_score(float show, float click);
// SparseValueSGDRule* _embed_sgd_rule;
// SparseValueSGDRule* _embedx_sgd_rule;
// SparseFeatureValue sparse_feature_value;
float _show_click_decay_rate;
int32_t _ssd_unseenday_threshold;
public: // TODO(zhaocaibei123): it should be private, but we make it public
// for unit test
SparseFeatureValue sparse_feature_value;
float show_click_score(float show, float click);
SparseValueSGDRule* _embed_sgd_rule;
SparseValueSGDRule* _embedx_sgd_rule;
};
} // namespace distributed
} // namespace paddle
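The feature-value offsets above are defined by chaining each index off the previous one, so the concrete layout depends on the SGD rules chosen in `initialize()`. A sketch that evaluates the offsets for one assumed configuration (a one-slot SGD state for both embeddings, `embedx_dim = 8`):

```cpp
// SparseFeatureValue offsets for assumed dims: embed_sgd_dim = 1,
// embedx_dim = 8, embedx_sgd_dim = 1.
#include <cstdio>

int main() {
  const int embed_sgd_dim = 1, embedx_dim = 8, embedx_sgd_dim = 1;
  const int slot = 0;
  const int unseen_days = slot + 1;         // 1
  const int delta_score = unseen_days + 1;  // 2
  const int show = delta_score + 1;         // 3
  const int click = show + 1;               // 4
  const int embed_w = click + 1;            // 5
  const int embed_g2sum = embed_w + 1;      // 6
  const int embedx_w = embed_g2sum + embed_sgd_dim;  // 7
  const int embedx_g2sum = embedx_w + embedx_dim;    // 15
  const int dim = 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim;  // 16
  std::printf("embedx_w=%d embedx_g2sum=%d dim=%d\n", embedx_w, embedx_g2sum,
              dim);
}
```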
......@@ -27,6 +27,7 @@
#endif
#include "paddle/fluid/distributed/ps/table/ctr_accessor.h"
#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
#include "paddle/fluid/distributed/ps/table/sparse_accessor.h"
#include "paddle/fluid/distributed/ps/table/tensor_accessor.h"
#include "paddle/fluid/distributed/ps/table/tensor_table.h"
......@@ -49,6 +50,7 @@ REGISTER_PSCORE_CLASS(Table, MemorySparseTable);
REGISTER_PSCORE_CLASS(Table, MemorySparseGeoTable);
REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor);
REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor);
REGISTER_PSCORE_CLASS(ValueAccessor, SparseAccessor);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
......
......@@ -71,11 +71,22 @@ class FleetWrapper : public PSWrapper {
}
virtual int32_t Initialize(InitContext& context) { return 0; }
// TODO(zhaocaibei123): later
int32_t CopyTable(const uint64_t src_table_id, const uint64_t dest_table_id);
int32_t CopyTableByFeasign(const uint64_t src_table_id,
const uint64_t dest_table_id,
const std::vector<uint64_t>& feasign_list);
typedef std::function<void(int, int)> HeterCallBackFunc;
int RegisterHeterCallback(HeterCallBackFunc handler);
virtual void Stop() override;
virtual void Load(WrapperContext& context) override;
virtual void Save(WrapperContext& context) override;
// set client to client communication config
void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms,
int max_retry);
......@@ -168,7 +179,8 @@ class FleetWrapper : public PSWrapper {
std::vector<const LoDTensor*>* inputs,
const LoDTensor* shows,
const LoDTensor* clicks,
std::vector<LoDTensor*>* outputs);
std::vector<LoDTensor*>* outputs,
bool use_cvm_op = false);
// Push sparse variables to server in Async mode
// Param<In>: scope, table_id, fea_keys, sparse_grad_names
// Param<Out>: push_values, push_sparse_status
......@@ -185,12 +197,7 @@ class FleetWrapper : public PSWrapper {
const std::vector<framework::ProgramDesc>& server_sub_program = {});
// init trainer
void InitWorker(const std::string& dist_desc,
const std::vector<std::string>& host_sign_list, Scope* scope,
const RpcCtxMap& send_ctx,
const std::unordered_map<uint64_t, std::vector<std::string>>&
dense_varnames,
const std::map<std::string, std::string>& envs, int node_num,
int index);
const std::vector<std::string>& host_sign_list, int index);
// stop server
void StopServer();
......@@ -200,6 +207,8 @@ class FleetWrapper : public PSWrapper {
uint64_t RunServer(const std::string& ip, uint32_t port);
// get client info
std::vector<uint64_t> GetClientsInfo();
// set client info
int SetClients(std::vector<uint64_t>& host_sign_list); // NOLINT
// create client to client connection
void CreateClient2ClientConnection();
// flush all push requests
......@@ -255,10 +264,15 @@ class FleetWrapper : public PSWrapper {
// this performs better than rand_r, especially large data
std::default_random_engine& LocalRandomEngine();
// for init worker
void InitGFlag(const std::string& gflags);
static std::shared_ptr<paddle::distributed::PSCore> pserver_ptr_;
static std::shared_ptr<paddle::distributed::PSClient> worker_ptr_;
private:
static std::shared_ptr<FleetWrapper> s_instance_;
paddle::distributed::PaddlePSEnvironment ps_env_;
size_t GetAbsoluteSum(size_t start, size_t end, size_t level,
const framework::LoD& lod);
......
......@@ -74,7 +74,7 @@ TEST(MemorySparseTable, SGD) {
std::vector<uint32_t> init_fres = {1, 1, 1, 1, 1};
std::vector<float> init_values;
init_values.resize(init_keys.size() * (emb_dim + 1));
init_values.resize(init_keys.size() * (emb_dim + 3));
auto value = PullSparseValue(init_keys, init_fres, emb_dim);
table->pull_sparse(init_values.data(), value);
......@@ -119,11 +119,11 @@ TEST(MemorySparseTable, SGD) {
}
std::vector<float> pull_values;
pull_values.resize(init_keys.size() * (emb_dim + 1));
pull_values.resize(init_keys.size() * (emb_dim + 3));
table->pull_sparse(pull_values.data(), value);
for (size_t i = 0; i < init_keys.size(); ++i) {
for (size_t j = 0; j < emb_dim + 1; ++j) {
for (size_t j = 2; j < emb_dim + 3; ++j) {
auto update_val = init_values[i * (emb_dim + 1) + j] -
0.1 * total_gradients[3 + i * (emb_dim + 4) + j];
VLOG(3) << total_gradients[i * (emb_dim + 4) + j + 3] << ":"
......
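The new buffer sizes in this test follow from the CtrCommonAccessor layouts: pull values are now `emb_dim + 3` floats per key and gradients are read with an `emb_dim + 4` stride. A sketch of the index arithmetic under that reading (an assumed interpretation of the truncated test context):

```cpp
// Assumed layouts behind the test sizes above:
// pull value per key = emb_dim + 3 floats: [show, click, embed_w, embedx...]
// push value per key = emb_dim + 4 floats: [slot, show, click, embed_g, embedx_g...]
#include <cstdio>

int main() {
  const int emb_dim = 10;
  const int pull_stride = emb_dim + 3;
  const int push_stride = emb_dim + 4;
  const int key = 2;
  // embed_w of key 2 in the flat pull buffer (weights start at offset 2):
  std::printf("pull embed_w at %d\n", key * pull_stride + 2);
  // embed_g of key 2 in the flat gradient buffer (grads start at offset 3):
  std::printf("push embed_g at %d\n", key * push_stride + 3);
}
```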
......@@ -24,6 +24,9 @@
#include "paddle/fluid/eager/utils.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
namespace egr {
......
......@@ -235,7 +235,7 @@ if(WITH_PYTHON)
py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)
py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto)
py_proto_compile(ps_py_proto SRCS ps.proto)
py_proto_compile(ps_py_proto SRCS the_one_ps.proto)
#Generate an empty \
#__init__.py to make framework_py_proto a valid python module.
add_custom_target(fleet_proto_init ALL
......@@ -249,7 +249,7 @@ if(WITH_PYTHON)
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
COMMAND cp the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto
......@@ -261,7 +261,7 @@ if(WITH_PYTHON)
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND copy /Y *.py ${proto_dstpath}
COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath}
COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath}
COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
......@@ -314,7 +314,7 @@ if(WITH_DISTRIBUTE)
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
heterxpu_trainer.cc heter_pipeline_trainer.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
downpour_worker.cc downpour_worker_opt.cc
downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
index_sampler index_wrapper sampler index_dataset_proto
......@@ -329,6 +329,7 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(downpour_lite_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(heter_section_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(heter_pipeline_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
......
......@@ -27,6 +27,10 @@ limitations under the License. */
#include <utility> // NOLINT
#include <vector>
#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/heter_util.h"
......@@ -107,7 +111,12 @@ class PullDenseWorker {
bool CheckUpdateParam(uint64_t table_id);
private:
#if defined(PADDLE_WITH_PSCORE)
std::shared_ptr<paddle::distributed::FleetWrapper> fleet_ptr_;
#else
std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
#endif
PullDenseWorkerParameter param_;
DownpourWorkerParameter dwp_param_;
Scope* root_scope_;
......@@ -341,6 +350,79 @@ class DownpourWorker : public HogwildWorker {
// std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
};
// Based on DownpourWorker; push/pull code is moved into operators
#if defined(PADDLE_WITH_PSCORE)
class DownpourLiteWorker : public HogwildWorker {
public:
DownpourLiteWorker() {}
virtual ~DownpourLiteWorker() {}
virtual void Initialize(const TrainerDesc& desc);
virtual void TrainFiles();
virtual void TrainFilesWithProfiler();
protected:
std::shared_ptr<paddle::distributed::FleetWrapper> fleet_ptr_;
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
void PushGradients();
void CopySparseTable();
void CopyDenseTable();
void CopyDenseVars();
DownpourWorkerParameter param_;
// copy table
CopyTableConfig copy_table_config_;
std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
// actually pushed feasign of each table
std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
// feasign
std::map<uint64_t, std::vector<uint64_t>> features_;
// feasign embedding
std::map<uint64_t, std::vector<std::vector<float>>> feature_values_;
std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
// adjust ins weight
AdjustInsWeightConfig adjust_ins_weight_config_;
// check nan and inf during training
std::vector<std::string> check_nan_var_names_;
bool need_to_push_sparse_;
// feasign stats
std::map<uint64_t, std::vector<float>> feature_labels_;
std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
// feasign embedding gradient
std::map<uint64_t, std::vector<std::vector<float>>> feature_grads_;
std::vector<::std::future<int32_t>> push_sparse_status_;
bool dump_slot_;
bool need_to_push_dense_;
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
float scale_datanorm_;
std::vector<::std::future<int32_t>> push_dense_status_;
// skipped ops
std::vector<std::string> skip_ops_;
// just save the value in param_ for easy access
std::map<uint64_t, std::string> label_var_name_;
std::map<uint64_t, std::vector<std::string>> dense_value_names_;
std::map<uint64_t, uint64_t> table_dependency_;
std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
// multitask
std::map<int32_t, uint64_t> cond2table_map_;
std::set<uint64_t> condvalue_set_;
bool flag_partial_push_;
private:
// std::vector<std::string> dump_param_;
// just save the value in param_ for easy access
// std::map<uint64_t, std::string> label_var_name_;
// std::map<uint64_t, std::vector<std::string>> dense_value_names_;
std::shared_ptr<PullDenseWorker> _pull_dense_worker;
std::vector<float> nid_show_;
// std::map<uint64_t, uint64_t> table_dependency_;
// std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
};
#endif
class DownpourWorkerOpt : public DownpourWorker {
public:
DownpourWorkerOpt() {}
......
......@@ -67,6 +67,7 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
#if defined(PADDLE_WITH_PSCORE)
REGISTER_DEVICE_WORKER_CLASS(DownpourLiteWorker);
REGISTER_DEVICE_WORKER_CLASS(HeterSectionWorker);
#endif
......
......@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
......@@ -62,7 +66,11 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
}
void DistMultiTrainer::RegisterHeterCallback() {
#ifdef PADDLE_WITH_PSCORE
auto fleet_ptr = paddle::distributed::FleetWrapper::GetInstance();
#else
auto fleet_ptr = FleetWrapper::GetInstance();
#endif
fleet_ptr->RegisterHeterCallback(
[this](int worker, int taskid) { workers_[worker]->Schedule(taskid); });
}
......@@ -93,7 +101,7 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
workers_[i]->SetRootScope(root_scope_);
workers_[i]->CreateDeviceResource(main_program); // Program
workers_[i]->BindingDataFeedMemory();
#ifdef PADDLE_WITH_PSLIB
#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
workers_[i]->CacheProgram(main_program);
#endif
}
......@@ -110,7 +118,7 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
}
pull_dense_worker_->SetRootScope(root_scope_);
pull_dense_worker_->Start();
#ifdef PADDLE_WITH_PSLIB
#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
for (int i = 0; i < thread_num_; ++i) {
workers_[i]->GetXpuOpIndex();
}
......@@ -176,8 +184,12 @@ void DistMultiTrainer::Finalize() {
pull_dense_worker_->Stop();
root_scope_->DropKids();
// flush local client push queue
// flush local client push queue
#ifdef PADDLE_WITH_PSCORE
auto fleet_ptr_ = paddle::distributed::FleetWrapper::GetInstance();
#else
auto fleet_ptr_ = FleetWrapper::GetInstance();
#endif
fleet_ptr_->ClientFlush();
}
......
This diff has been collapsed.
......@@ -19,7 +19,7 @@
#include <numeric>
#include "paddle/fluid/framework/lod_tensor.h"
#if defined(PADDLE_WITH_PSLIB)
#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
namespace paddle {
namespace framework {
......
......@@ -38,7 +38,7 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_PSLIB)
#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
namespace paddle {
namespace framework {
......
......@@ -61,7 +61,13 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
last_versions_[tid] = 0;
current_version_[tid] = 0;
}
#if defined(PADDLE_WITH_PSCORE)
fleet_ptr_ = paddle::distributed::FleetWrapper::GetInstance();
#else
fleet_ptr_ = FleetWrapper::GetInstance();
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
copy_streams_.clear();
#endif
......@@ -170,6 +176,9 @@ void PullDenseWorker::PullDense(bool force_update) {
VLOG(3) << "pull dense " << force_update << " " << tid;
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, false);
#elif defined(PADDLE_WITH_PSCORE)
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, true);
#else
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, true);
......
......@@ -30,6 +30,21 @@ namespace operators {
class AbsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -72,8 +87,17 @@ class AbsGradOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(dtype, ctx.GetPlace());
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
......
......@@ -390,6 +390,204 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
}
}
};
template <typename T>
class NPUConv3dKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
Tensor* output = ctx.Output<Tensor>("Output");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
PADDLE_ENFORCE_EQ(data_format, "NCDHW",
platform::errors::Unimplemented(
"the data_format must be NCDHW in "
"the npu kernel of conv3d, but got data_format "
"= [%s]",
data_format));
PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented(
"the groups must be 1 in "
"the npu kernel of conv3d, but got groups "
"= [%d]",
groups));
output->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto input_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(input->dims(), dev_ctx);
auto filter_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(filter->dims(), dev_ctx);
auto output_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(output->dims(), dev_ctx);
input_tensor.ShareDataWith(*input);
filter_tensor.ShareDataWith(*filter);
output_tensor.ShareDataWith(*output);
input_tensor.set_layout(DataLayout::kNCDHW);
filter_tensor.set_layout(DataLayout::kNCDHW);
output_tensor.set_layout(DataLayout::kNCDHW);
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
std::vector<int> strides_vec(5, 1);
std::vector<int> dilations_vec(5, 1);
strides_vec[2] = strides[0];
strides_vec[3] = strides[1];
strides_vec[4] = strides[2];
dilations_vec[2] = dilations[0];
dilations_vec[3] = dilations[1];
dilations_vec[4] = dilations[2];
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
const auto& runner =
NpuOpRunner("Conv3D", {input_tensor, filter_tensor}, {output_tensor},
{{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
};
template <typename T>
class NPUConv3dGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
PADDLE_ENFORCE_EQ(data_format, "NCDHW",
platform::errors::Unimplemented(
"the data_format must be NCDHW in "
"the npu kernel of conv3d, but got data_format "
"= [%s]",
data_format));
PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented(
"the groups must be 1 in "
"the npu kernel of conv3d, but got groups "
"= [%d]",
groups));
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto input_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(input->dims(), dev_ctx);
auto filter_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(filter->dims(), dev_ctx);
auto output_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
output_grad->dims(), dev_ctx);
input_tensor.ShareDataWith(*input);
filter_tensor.ShareDataWith(*filter);
output_grad_tensor.ShareDataWith(*output_grad);
input_tensor.set_layout(DataLayout::kNCDHW);
filter_tensor.set_layout(DataLayout::kNCDHW);
output_grad_tensor.set_layout(DataLayout::kNCDHW);
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
std::vector<int> strides_vec(5, 1);
std::vector<int> dilations_vec(5, 1);
strides_vec[2] = strides[0];
strides_vec[3] = strides[1];
strides_vec[4] = strides[2];
dilations_vec[2] = dilations[0];
dilations_vec[3] = dilations[1];
dilations_vec[4] = dilations[2];
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> filter_shape_vec = phi::vectorize<int>(filter->dims());
Tensor filter_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
filter_grad->dims(), dev_ctx);
filter_grad_tensor.ShareDataWith(*filter_grad);
filter_grad_tensor.set_layout(DataLayout::kNCDHW);
const auto& runner = NpuOpRunner(
"Conv3DBackpropFilterD", {input_tensor, output_grad_tensor},
{filter_grad_tensor}, {{"filter_size", filter_shape_vec},
{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> input_shape_vec = phi::vectorize<int>(input->dims());
Tensor input_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
input_grad->dims(), dev_ctx);
input_grad_tensor.ShareDataWith(*input_grad);
input_grad_tensor.set_layout(DataLayout::kNCDHW);
const auto& runner = NpuOpRunner(
"Conv3DBackpropInputD", {filter_tensor, output_grad_tensor},
{input_grad_tensor}, {{"input_size", input_shape_vec},
{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -408,3 +606,9 @@ REGISTER_OP_NPU_KERNEL(conv2d, ops::NPUConvOpKernel<float>,
REGISTER_OP_NPU_KERNEL(conv2d_grad, ops::NPUConvGradOpKernel<float>,
ops::NPUConvGradOpKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(conv3d, ops::NPUConv3dKernel<float>,
ops::NPUConv3dKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(conv3d_grad, ops::NPUConv3dGradKernel<float>,
ops::NPUConv3dGradKernel<plat::float16>);
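At the Python layer, the enforce checks above mean this NPU path only accepts NCDHW data with groups == 1, with strides/dilations expanded to 5-D vectors internally. A minimal usage sketch, assuming an Ascend-enabled Paddle build and an available device (`npu:0` is a placeholder):
import paddle
import paddle.nn.functional as F
paddle.set_device('npu:0')  # assumption: NPU-enabled build and device
x = paddle.randn([2, 3, 8, 8, 8], dtype='float32')  # NCDHW input
w = paddle.randn([4, 3, 3, 3, 3], dtype='float32')  # [out_c, in_c/groups, kD, kH, kW]
y = F.conv3d(x, w, stride=1, padding=1, dilation=1, groups=1, data_format='NCDHW')
print(y.shape)  # [2, 4, 8, 8, 8] with padding=1 and 3x3x3 kernels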
......@@ -315,15 +315,7 @@ using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc<
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>); \
REGISTER_OP_KERNEL( \
act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(act_type, functor, \
grad_functor) \
#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_KERNEL( \
act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>, \
......@@ -339,30 +331,27 @@ namespace ops = paddle::operators;
ops::MKLDNNActivationKernel<ops::functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
__macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \
__macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
__macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \
__macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \
__macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor); \
__macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \
__macro(elu, EluMKLDNNFunctor, EluMKLDNNGradUseOutFunctor); \
__macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor);
__macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); \
__macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); \
__macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \
__macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
__macro(mish, MishMKLDNNFunctor, MishMKLDNNGradFunctor); \
__macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
__macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \
__macro(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradUseOutFunctor); \
__macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradUseOutFunctor); \
__macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \
__macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor,
ReluMKLDNNGradFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor,
GeluMKLDNNGradFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor,
SigmoidMKLDNNGradUseOutFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sqrt, SqrtMKLDNNFunctor,
SqrtMKLDNNGradUseOutFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(mish, MishMKLDNNFunctor,
MishMKLDNNGradFunctor);
REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor);
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(
softplus, MKLDNN, paddle::platform::CPUPlace,
ops::MKLDNNActivationKernel<ops::SoftplusMKLDNNFunctor<float>>);
ops::MKLDNNActivationKernel<ops::SoftplusMKLDNNFunctor<float>>,
ops::MKLDNNActivationKernel<
ops::SoftplusMKLDNNFunctor<paddle::platform::bfloat16>>);
......@@ -13,7 +13,6 @@
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -52,15 +51,13 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");
// auto fleet = distributed::FleetWrapper::GetInstance();
auto *communicator = (distributed::AsyncCommunicator *)
distributed::Communicator::GetInstance();
auto fleet = distributed::FleetWrapper::GetInstance();
if (platform::is_cpu_place(context.GetPlace())) {
communicator->PullSparseToTensorSync(
static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx), context.GetPlace(), !is_test,
&inputs, &outputs);
fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx),
context.GetPlace(), !is_test, &inputs,
&outputs);
} else {
auto inputs_variable = context.MultiInputVar("Ids");
auto outputs_variable = context.MultiOutputVar("Outputs");
......@@ -96,10 +93,10 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
}
// use fleet->PullSparse
communicator->PullSparseToTensorSync(
static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx), cpu_place, !is_test,
&tmp_input_vec, &tmp_output_vec);
fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx),
cpu_place, !is_test, &tmp_input_vec,
&tmp_output_vec);
// cp temp to origin
for (size_t idx = 0; idx < output_var_size; ++idx) {
......
......@@ -106,6 +106,9 @@ class DistributedPushSparseOpMaker : public framework::OpProtoAndCheckerMaker {
"for training.")
.SetDefault(false);
AddAttr<bool>("use_cvm_op", "(boolean, default false) Use cvm op or not.")
.SetDefault(false);
AddComment(R"DOC(
Lookup Table Prefetch Operator.
This operator is used to perform lookup on parameter W,
......
......@@ -13,7 +13,6 @@
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -32,22 +31,20 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
auto padding_idx = context.Attr<int64_t>("padding_idx");
auto table_id = context.Attr<int>("table_id");
auto emb_dim = context.Attr<int>("size");
VLOG(1) << "push_sparse.h::emb_dim: " << emb_dim;
auto use_cvm_op = context.Attr<bool>("use_cvm_op");
auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
auto shows = context.Input<framework::LoDTensor>("Shows");
auto clks = context.Input<framework::LoDTensor>("Clicks");
auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");
// auto fleet = distributed::FleetWrapper::GetInstance();
auto *communicator = (distributed::AsyncCommunicator *)
distributed::Communicator::GetInstance();
auto fleet = distributed::FleetWrapper::GetInstance();
if (platform::is_cpu_place(context.GetPlace())) {
communicator->PushSparseFromTensorAsync(
static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx), context.GetPlace(), &inputs,
shows, clks, &outputs);
fleet->PushSparseFromTensorAsync(static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx),
context.GetPlace(), &inputs, shows, clks,
&outputs, use_cvm_op);
} else {
auto inputs_variable = context.MultiInputVar("Ids");
auto outputs_variable = context.MultiOutputVar("Outputs");
......@@ -94,7 +91,7 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
}
// use fleet->PullSparse
communicator->PushSparseFromTensorAsync(
fleet->PushSparseFromTensorAsync(
static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx), context.GetPlace(),
&tmp_input_vec, tmp_shows_tensor, tmp_clicks_tensor, &tmp_output_vec);
......
......@@ -53,7 +53,7 @@ class SendOp : public framework::OperatorBase {
send_varnames[0] != "@PS_STEP_COUNTER@") {
auto fleet = paddle::distributed::FleetWrapper::GetInstance();
std::vector<::std::future<int32_t>> status;
fleet->PushDenseVarsAsync(scope, table_id, ins, &status, 0, -1);
fleet->PushDenseVarsAsync(scope, table_id, ins, &status, -1, -1);
} else {
auto* communicator = paddle::distributed::Communicator::GetInstance();
if (communicator->Check(send_varnames)) {
......
......@@ -47,6 +47,8 @@ static std::map<framework::proto::VarType::Type, aclDataType>
static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
{DataLayout::kNCHW, ACL_FORMAT_NCHW},
{DataLayout::kNHWC, ACL_FORMAT_NHWC},
{DataLayout::kNCDHW, ACL_FORMAT_NCDHW},
{DataLayout::kNDHWC, ACL_FORMAT_NDHWC},
{DataLayout::kAnyLayout, ACL_FORMAT_ND},
};
......
......@@ -77,6 +77,8 @@ void BindDistFleetWrapper(py::module* m) {
.def("stop_worker", &FleetWrapper::FinalizeWorker)
.def("barrier", &FleetWrapper::BarrierWithTable)
.def("shrink_sparse_table", &FleetWrapper::ShrinkSparseTable)
.def("set_clients", &FleetWrapper::SetClients)
.def("get_client_info", &FleetWrapper::GetClientsInfo)
.def("create_client2client_connection",
&FleetWrapper::CreateClient2ClientConnection);
}
......
......@@ -30,6 +30,8 @@ enum class DataLayout {
SPARSE_COO,
SPARSE_CSR,
NUM_DATA_LAYOUTS,
NDHWC,
NCDHW,
// See Note [ Why we need ALL in basic kernel key member? ]
ALL_LAYOUT = UNDEFINED,
// Note: Unify phi DataLayout and fluid::framework::DataLayout,
......@@ -43,6 +45,8 @@ enum class DataLayout {
kNHWC = NHWC,
kNCHW = NCHW,
kMKLDNN = MKLDNN, // all layouts supported by MKLDNN internally
kNDHWC = NDHWC,
kNCDHW = NCDHW,
};
} // namespace experimental
......@@ -70,6 +74,10 @@ inline DataLayout StringToDataLayout(const std::string& str) {
return DataLayout::SPARSE_COO;
} else if (s == "SPARSE_CSR") {
return DataLayout::SPARSE_CSR;
} else if (s == "NDHWC") {
return DataLayout::kNDHWC;
} else if (s == "NCDHW") {
return DataLayout::kNCDHW;
} else {
PD_THROW("Unknown data layout type string: ", s, ".");
}
......@@ -89,6 +97,10 @@ inline std::string DataLayoutToString(const DataLayout& layout) {
return "SPARSE_COO";
case DataLayout::SPARSE_CSR:
return "SPARSE_CSR";
case DataLayout::kNDHWC:
return "NDHWC";
case DataLayout::kNCDHW:
return "NCDHW";
default:
PD_THROW("Unknown Data Layout type ", static_cast<int>(layout), ".");
}
......
......@@ -259,7 +259,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad,
phi::dtype::bfloat16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(elementwise_fmax_grad,
PD_REGISTER_KERNEL(fmax_grad,
CPU,
ALL_LAYOUT,
phi::ElementwiseFMaxGradKernel,
......@@ -268,7 +268,7 @@ PD_REGISTER_KERNEL(elementwise_fmax_grad,
int,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_fmin_grad,
PD_REGISTER_KERNEL(fmin_grad,
CPU,
ALL_LAYOUT,
phi::ElementwiseFMinGradKernel,
......
......@@ -87,23 +87,11 @@ using complex128 = ::phi::dtype::complex<double>;
// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
// using bfloat16 = ::phi::dtype::bfloat16;
PD_REGISTER_KERNEL(elementwise_fmax,
CPU,
ALL_LAYOUT,
phi::ElementwiseFMaxKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
fmax, CPU, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_fmin,
CPU,
ALL_LAYOUT,
phi::ElementwiseFMinKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(add_raw,
CPU,
......
......@@ -20,18 +20,18 @@
namespace phi {
template <typename T, typename Context>
void ElementwiseFMaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
void FMaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void ElementwiseFMinKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
void FMinKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx,
......
......@@ -282,7 +282,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad,
phi::dtype::bfloat16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(elementwise_fmax_grad,
PD_REGISTER_KERNEL(fmax_grad,
GPU,
ALL_LAYOUT,
phi::ElementwiseFMaxGradKernel,
......@@ -291,7 +291,7 @@ PD_REGISTER_KERNEL(elementwise_fmax_grad,
int,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_fmin_grad,
PD_REGISTER_KERNEL(fmin_grad,
GPU,
ALL_LAYOUT,
phi::ElementwiseFMinGradKernel,
......
......@@ -57,23 +57,11 @@ using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(elementwise_fmax,
GPU,
ALL_LAYOUT,
phi::ElementwiseFMaxKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
fmax, GPU, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_fmin,
GPU,
ALL_LAYOUT,
phi::ElementwiseFMinKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
fmin, GPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(add_raw,
GPU,
......
......@@ -23,22 +23,22 @@
namespace phi {
template <typename T, typename Context>
void ElementwiseFMaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
void FMaxKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
funcs::ElementwiseCompute<funcs::FMaxFunctor<T>, T, T>(
dev_ctx, x, y, axis, funcs::FMaxFunctor<T>(), out);
}
template <typename T, typename Context>
void ElementwiseFMinKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
void FMinKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
funcs::ElementwiseCompute<funcs::FMinFunctor<T>, T, T>(
dev_ctx, x, y, axis, funcs::FMinFunctor<T>(), out);
......
......@@ -19,25 +19,19 @@ namespace phi {
KernelSignature ElementwiseAddOpArgumentMapping(
const ArgumentMappingContext& ctx) {
int axis = paddle::any_cast<int>(ctx.Attr("axis"));
if (ctx.IsDenseTensorInput("X")) {
if (axis == -1) {
return KernelSignature("add", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"});
if (axis == -1) {
return KernelSignature("add", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("unregistered", {}, {}, {});
return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwiseSubOpArgumentMapping(
const ArgumentMappingContext& ctx) {
int axis = paddle::any_cast<int>(ctx.Attr("axis"));
if (ctx.IsDenseTensorInput("X")) {
if (axis == -1) {
return KernelSignature("subtract", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("subtract_raw", {"X", "Y"}, {"axis"}, {"Out"});
if (axis == -1) {
return KernelSignature("subtract", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("unregistered", {}, {}, {});
return KernelSignature("subtract_raw", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwiseMulOpArgumentMapping(
......@@ -55,24 +49,18 @@ KernelSignature ElementwiseMulOpArgumentMapping(
KernelSignature ElementwiseDivOpArgumentMapping(
const ArgumentMappingContext& ctx) {
int axis = paddle::any_cast<int>(ctx.Attr("axis"));
if (ctx.IsDenseTensorInput("X")) {
if (axis == -1) {
return KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, {"Out"});
if (axis == -1) {
return KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("unregistered", {}, {}, {});
return KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwiseAddGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) {
return KernelSignature("add_grad",
{"X", "Y", GradVarName("Out")},
{"axis"},
{GradVarName("X"), GradVarName("Y")});
}
return KernelSignature("unregistered", {}, {}, {});
return KernelSignature("add_grad",
{"X", "Y", GradVarName("Out")},
{"axis"},
{GradVarName("X"), GradVarName("Y")});
}
KernelSignature ElementwiseAddDoubleGradOpArgumentMapping(
......@@ -91,13 +79,10 @@ KernelSignature ElementwiseAddTripleGradOpArgumentMapping(
KernelSignature ElementwiseSubGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) {
return KernelSignature("subtract_grad",
{"X", "Y", GradVarName("Out")},
{"axis"},
{GradVarName("X"), GradVarName("Y")});
}
return KernelSignature("unregistered", {}, {}, {});
return KernelSignature("subtract_grad",
{"X", "Y", GradVarName("Out")},
{"axis"},
{GradVarName("X"), GradVarName("Y")});
}
KernelSignature ElementwiseSubDoubleGradOpArgumentMapping(
......@@ -116,7 +101,7 @@ KernelSignature ElementwiseDivGradOpArgumentMapping(
KernelSignature ElementwiseFMinGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("elementwise_fmin_grad",
return KernelSignature("fmin_grad",
{"X", "Y", GradVarName("Out")},
{"axis"},
{GradVarName("X"), GradVarName("Y")});
......@@ -138,9 +123,19 @@ KernelSignature ElementwiseMulGradOpArgumentMapping(
{GradVarName("X"), GradVarName("Y")});
}
KernelSignature ElementwiseFMaxOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fmax", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwiseFMinOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fmin", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwiseFMaxGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("elementwise_fmax_grad",
return KernelSignature("fmax_grad",
{"X", "Y", GradVarName("Out")},
{"axis"},
{GradVarName("X"), GradVarName("Y")});
......@@ -179,6 +174,10 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad, multiply_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad_grad, multiply_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_triple_grad, multiply_triple_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax, fmax);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin, fmin);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax_grad, fmax_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin_grad, fmin_grad);
PD_REGISTER_ARG_MAPPING_FN(elementwise_add,
phi::ElementwiseAddOpArgumentMapping);
......@@ -208,9 +207,12 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad,
phi::ElementwiseMulDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad,
phi::ElementwiseMulTripleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax,
phi::ElementwiseFMaxOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin,
phi::ElementwiseFMinOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad,
phi::ElementwiseFMaxGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad,
phi::ElementwiseFMinGradOpArgumentMapping);
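These registrations only rename the phi kernels and remap the legacy op names; the user-facing ops are unchanged. A quick sketch of the NaN-aware semantics behind fmax/fmin (standard paddle API):
import paddle
x = paddle.to_tensor([1.0, float('nan'), 3.0])
y = paddle.to_tensor([2.0, 5.0, float('nan')])
# fmax/fmin return the non-NaN operand when exactly one side is NaN
print(paddle.fmax(x, y))  # [2., 5., 3.]
print(paddle.fmin(x, y))  # [1., 5., 3.]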
......@@ -578,7 +578,7 @@ class Fleet(object):
@is_non_distributed_check
@inited_runtime_handler
def init_worker(self):
def init_worker(self, scopes=None):
"""
initialize `Communicator` for parameter server training.
......@@ -599,7 +599,7 @@ class Fleet(object):
fleet.init_worker()
"""
self._runtime_handle._init_worker()
self._runtime_handle._init_worker(scopes)
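With this change `init_worker` accepts one scope per origin main program (defaulting to the global scope in the single-program case). A sketch of the intended call pattern; the two scopes stand in for a hypothetical two-program job:
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
# assumption: fleet.init() has already been called in parameter-server mode
scope_a, scope_b = fluid.Scope(), fluid.Scope()
# pass one scope per origin main program, in the same order as the losses
fleet.init_worker(scopes=[scope_a, scope_b])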
@is_non_distributed_check
@inited_runtime_handler
......@@ -1419,6 +1419,21 @@ class Fleet(object):
# for more examples, please reference https://github.com/PaddlePaddle/FleetX
"""
if not isinstance(loss, list):
return self._minimize_impl(loss, startup_program, parameter_list,
no_grad_set)
else:
if paddle.fluid.framework.in_dygraph_mode(
) or self._role_maker._is_non_distributed() or self._is_collective:
raise ValueError("loss can be list only in PS mode")
return self._minimize_losses_impl(loss, startup_program,
parameter_list, no_grad_set)
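For orientation, a sketch of the new list-of-losses path under these assumptions: a static-graph parameter-server job whose role and endpoints are already configured for `fleet.init()`, and two toy programs built purely for illustration:
import paddle
import paddle.distributed.fleet as fleet
paddle.enable_static()
fleet.init()  # assumption: PS-mode environment/role maker already set up
def build_program(tag):
    main, startup = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data(name='x_' + tag, shape=[None, 4], dtype='float32')
        y = paddle.static.data(name='y_' + tag, shape=[None, 1], dtype='float32')
        pred = paddle.static.nn.fc(x, size=1)
        loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))
    return loss, startup
loss_a, startup_a = build_program('a')
loss_b, startup_b = build_program('b')
strategy = fleet.DistributedStrategy()
strategy.a_sync = True
opt = fleet.distributed_optimizer(paddle.optimizer.SGD(learning_rate=0.01), strategy)
# a list of losses is PS-mode only and needs a matching list of startup programs
opt.minimize([loss_a, loss_b], startup_program=[startup_a, startup_b])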
def _minimize_impl(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None):
context = {}
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
......@@ -1447,6 +1462,7 @@ class Fleet(object):
"sharding_degree"]
context["origin_main_program"] = self.origin_main_program
context["origin_main_programs"] = [self.origin_main_program]
context["loss"] = loss
if startup_program == None:
self.origin_startup_program = \
......@@ -1457,6 +1473,7 @@ class Fleet(object):
startup_program.clone(for_test=False)
context["origin_startup_program"] = startup_program
context["origin_startup_programs"] = [startup_program]
context["role_maker"] = self._role_maker
# Use the auto-parallel's routines instead
......@@ -1512,6 +1529,8 @@ class Fleet(object):
copy_user_defined_strategy, can_not_apply_optimizer_list)
context["valid_strategy"] = copy.deepcopy(valid_strategy)
# print("valid_strategy:", context["valid_strategy"])
# print("user_defined_strategy:", context["user_defined_strategy"])
applied_meta_list = self.strategy_compiler._get_applied_meta_list()
applied_graph_list = self.strategy_compiler._get_applied_graph_list()
......@@ -1539,13 +1558,17 @@ class Fleet(object):
loss, startup_program, parameter_list, no_grad_set=no_grad_set)
if meta_optimizer:
# print("before minimize program id:", id(loss.block.program))
optimize_ops, params_grads = meta_optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set)
# print("after minimize program id:", id(loss.block.program))
default_program = paddle.static.default_main_program()
# print("default program id:", id(default_program))
if id(default_program) != id(loss.block.program):
paddle.fluid.framework.switch_main_program(loss.block.program)
# print("default program id after switch:", id(default_program))
else:
optimize_ops, params_grads = self.user_defined_optimizer.minimize(
......@@ -1555,6 +1578,7 @@ class Fleet(object):
context["program_params_grads"] = params_grads
if graph_optimizer:
# print("before graph minimize program id:", id(loss.block.program))
optimize_ops, params_grads = graph_optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set)
# since we do not encourage users to use graph operations
......@@ -1568,13 +1592,90 @@ class Fleet(object):
if not self._role_maker._is_heter_parameter_server_mode:
program = paddle.static.default_main_program()
opt_info = {}
opt_info = {} if program._fleet_opt is None else program._fleet_opt
opt_info["mpi_size"] = self.worker_num()
opt_info["mpi_rank"] = self.worker_index()
for k, v in self._user_defined_strategy.trainer_desc_configs.items(
):
opt_info[k] = v
program._fleet_opt = opt_info
if self._runtime_handle is None:
self._runtime_handle = RuntimeFactory()._create_runtime(context)
import paddle.distributed.fleet as fleet
fleet.util._set_strategy(context["valid_strategy"])
return optimize_ops, params_grads
def _minimize_losses_impl(self,
losses,
startup_programs=None,
parameter_list=None,
no_grad_set=None):
context = {}
# cache original feed forward program
self.origin_main_program = losses[0].block.program
context["origin_main_program"] = self.origin_main_program
context["origin_main_programs"] = []
for loss in losses:
context["origin_main_programs"].append(loss.block.program)
context["loss"] = losses
if startup_programs is None:
if len(losses) == 1:
startup_programs = [paddle.static.default_startup_program()]
else:
raise ValueError(
"startup_program can't be None when loss is list.")
self.origin_startup_program = startup_programs[0].clone(for_test=False)
context["origin_startup_program"] = startup_programs[0]
context["origin_startup_programs"] = []
for program in startup_programs:
context["origin_startup_programs"].append(program)
context["role_maker"] = self._role_maker
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
context["valid_strategy"] = copy.deepcopy(self._user_defined_strategy)
self._context = context
self.valid_strategy = context["valid_strategy"]
self.valid_strategy._enable_env()
optimize_ops = []
params_grads = []
from ..meta_optimizers import ParameterServerOptimizer
ps_optimizer = ParameterServerOptimizer(self.user_defined_optimizer)
ps_optimizer._set_basic_info(losses, self._role_maker,
self.user_defined_optimizer,
self._user_defined_strategy)
optimize_ops, params_grads = ps_optimizer.minimize_losses_impl(
losses, startup_programs, parameter_list, no_grad_set=no_grad_set)
# default_program = paddle.static.default_main_program()
# if id(default_program) != id(losses[0].block.program):
# paddle.fluid.framework.switch_main_program(losses[0].block.program)
context["program_optimize_ops"] = optimize_ops
context["program_params_grads"] = params_grads
for loss in losses:
program = loss.block.program
opt_info = {} if program._fleet_opt is None else program._fleet_opt
opt_info["mpi_size"] = self.worker_num()
opt_info["mpi_rank"] = self.worker_index()
for k, v in self._user_defined_strategy.trainer_desc_configs.items(
):
opt_info[k] = v
program._fleet_opt = opt_info
# print("fleet base opt info:", id(program), program._fleet_opt)
if self._runtime_handle is None:
self._runtime_handle = RuntimeFactory()._create_runtime(context)
......
......@@ -13,7 +13,7 @@
# limitations under the License.
from ..runtime.collective_runtime import CollectiveRuntime
from ..runtime.parameter_server_runtime import ParameterServerRuntime
from ..runtime.the_one_ps import TheOnePSRuntime
from ...ps.the_one_ps import TheOnePSRuntime
__all__ = []
......
......@@ -17,7 +17,7 @@ from .asp_optimizer import ASPOptimizer
from .recompute_optimizer import RecomputeOptimizer
from .gradient_merge_optimizer import GradientMergeOptimizer
from .graph_execution_optimizer import GraphExecutionOptimizer
from .parameter_server_optimizer import ParameterServerOptimizer
from .ps_optimizer import ParameterServerOptimizer
from .pipeline_optimizer import PipelineOptimizer
from .localsgd_optimizer import LocalSGDOptimizer
from .localsgd_optimizer import AdaptiveLocalSGDOptimizer
......
......@@ -110,8 +110,9 @@ class ParameterServerOptimizer(MetaOptimizerBase):
no_grad_set)
if startup_program == None:
startup_program = paddle.static.default_startup_program()
print("program after inner optimizer minimize:",
str(loss.block.program))
# print("program after inner optimizer minimize:",
# str(loss.block.program))
self._set_origin_programs([loss])
self._init_ps_pass_context(loss, startup_program)
ps_builder = PsProgramBuilderFactory()._create_ps_program_builder(
......@@ -181,7 +182,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
if not var.persistable or var.desc.type(
) != core.VarDesc.VarType.LOD_TENSOR:
continue
set_var_lod_type(var)
param_memory_size += get_var_mem_size(var)
processed_var_names.add(varname)
......@@ -211,9 +211,8 @@ class ParameterServerOptimizer(MetaOptimizerBase):
data_count *= (-x)
else:
data_count *= x
program_tmp_vars[var_name] = (
data_count, neg_dim_count,
vars_metatools.dtype_to_size[var.dtype])
program_tmp_vars[var_name] = (data_count, neg_dim_count,
dtype_to_size[var.dtype])
for varname in program_tmp_vars:
data_count, neg_dim_count, type_size = program_tmp_vars[varname]
......@@ -228,12 +227,19 @@ class ParameterServerOptimizer(MetaOptimizerBase):
return False
def _enable_strategy(self, dist_strategy, context):
a_sync_configs = dist_strategy.a_sync_configs
if dist_strategy.a_sync_configs["k_steps"] >= 0:
return
dist_strategy.a_sync = True
a_sync_configs = dist_strategy.a_sync_configs
is_geo = self._can_apply_geo(context["origin_main_program"])
dist_strategy.a_sync_configs["k_steps"] = 800 if is_geo else 0
a_sync_configs["k_steps"] = 800 if is_geo else 0
dist_strategy.a_sync_configs = a_sync_configs
def _disable_strategy(self, dist_strategy):
dist_strategy.a_sync = False
a_sync_configs = dist_strategy.a_sync_configs
dist_strategy.a_sync_configs["k_steps"] = -1
dist_strategy.a_sync_configs = a_sync_configs
......@@ -62,9 +62,9 @@ def get_default_accessor_proto(accessor, varname, o_main_program):
if not accessor.HasField("accessor_class"):
accessor.accessor_class = "CtrCommonAccessor"
if not accessor.HasField("fea_dim"):
accessor.fea_dim = embedding_dim + 2
accessor.fea_dim = embedding_dim
if not accessor.HasField("embedx_dim"):
accessor.embedx_dim = embedding_dim - 1
accessor.embedx_dim = embedding_dim - 3
if not accessor.HasField("embedx_threshold"):
accessor.embedx_threshold = 0
......@@ -129,15 +129,15 @@ def check_embedding_dim(accessor, varname, o_main_program):
embedding_dim = var.shape[1]
break
fea_dim = accessor.fea_dim
if fea_dim != embedding_dim + 2:
if fea_dim != embedding_dim:
raise ValueError(
"The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}".
format(embedding_dim + 2, fea_dim))
"The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}".
format(embedding_dim, fea_dim))
embedx_dim = accessor.embedx_dim
if embedx_dim != embedding_dim - 1:
if embedx_dim != embedding_dim - 3:
raise ValueError(
"The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}".
format(embedding_dim - 1, embedx_dim))
"The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}".
format(embedding_dim - 3, embedx_dim))
class Accessor:
......@@ -927,7 +927,6 @@ class TheOnePSRuntime(RuntimeBase):
tables = []
for idx, (name, ctx) in enumerate(send_ctx.items()):
print(" wxm python test send_ctx.items-->", idx, (name, ctx))
if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1:
continue
......
......@@ -75,7 +75,7 @@ class DistributedInfer:
if self.sparse_table_maps is None:
self.sparse_table_maps = {}
send_ctx = fleet.fleet._runtime_handle._communicator.send_ctx_
send_ctx = fleet.fleet._runtime_handle._send_ctx
for gradname, ctx in send_ctx.items():
if ctx.is_sparse:
param = gradname.strip("@GRAD")
......
......@@ -155,8 +155,6 @@ class AddListenAndServPass(PassBase):
main_program.global_block().append_op(
type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=opt)
attrs['cloned_main'] = main_program
@register_pass("add_rpc_global_flags_pass")
class AddRpcGlobalFlagsPass(PassBase):
......
......@@ -116,7 +116,7 @@ class DistributedOpsPass(PassBase):
def _check_conflict(self, other_pass):
return True
def _push_sparse_fuse(self, _program, push_sparse_ops, attrs):
def _push_sparse_fuse(self, _program, push_sparse_ops, attrs, use_cvm_op):
if attrs['use_ps_gpu']:
return
if len(push_sparse_ops) == 0:
......@@ -211,7 +211,8 @@ class DistributedOpsPass(PassBase):
"is_distributed": is_distributed,
"padding_idx": padding_idx,
"table_id": table_id,
"size": self.emb_size[param]
"size": self.emb_size[param],
"use_cvm_op": use_cvm_op
})
def _pull_sparse_fuse(self, _program, pull_sparse_ops, attrs, send_ctx):
......@@ -420,6 +421,7 @@ class DistributedOpsPass(PassBase):
pull_sparse_ids = {}
push_sparse_ops = {}
ops = {}
use_cvm_op = False
for op in _program.global_block().ops:
if op.type in SPARSE_OP_TYPE_DICT.keys() \
and op.attr('remote_prefetch') is True:
......@@ -433,6 +435,9 @@ class DistributedOpsPass(PassBase):
ids = pull_sparse_ids.get(param_name, [])
ids.append(op.input("Ids")[0])
pull_sparse_ids[param_name] = ids
if op.type == 'cvm':
use_cvm_op = True
for op in _program.global_block().ops:
if op.type in SPARSE_GRAD_OP_TYPE_DICT.keys():
param_name = op.input(SPARSE_GRAD_OP_TYPE_DICT[op.type])[0]
......@@ -442,16 +447,16 @@ class DistributedOpsPass(PassBase):
ops.append(op)
push_sparse_ops[param_name] = ops
return pull_sparse_ops, push_sparse_ops
return pull_sparse_ops, push_sparse_ops, use_cvm_op
def _apply_single_impl(self, main_program, startup_program, pass_ctx):
attrs = pass_ctx._attrs
pull_sparse_ops, push_sparse_ops = self._get_pull_sparse_ops(
pull_sparse_ops, push_sparse_ops, use_cvm_op = self._get_pull_sparse_ops(
main_program, attrs)
send_ctx = get_the_one_send_context(
attrs, split_dense_table=attrs['is_heter_ps_mode'])
self._pull_sparse_fuse(main_program, pull_sparse_ops, attrs, send_ctx)
self._push_sparse_fuse(main_program, push_sparse_ops, attrs)
self._push_sparse_fuse(main_program, push_sparse_ops, attrs, use_cvm_op)
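The cvm detection threaded through this pass amounts to one scan of the global block; an equivalent one-line sketch, where `program` stands for the main program being rewritten:
use_cvm_op = any(op.type == 'cvm' for op in program.global_block().ops)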
@register_pass("delete_optimizer_pass")
......
......@@ -15,7 +15,7 @@
import warnings
import os
from paddle.distributed.fleet.proto import ps_pb2
import paddle.distributed.fleet.proto.the_one_ps_pb2 as ps_pb2
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
from paddle.fluid import core
......@@ -68,16 +68,30 @@ def check_embedding_dim(accessor_proto, varname, program_id, context):
print('new var: {}, {}, {}'.format(var, embedding_dim,
accessor_proto.fea_dim))
break
fea_dim = accessor_proto.fea_dim
if fea_dim != embedding_dim + 2:
raise ValueError(
"The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}".
format(embedding_dim + 2, fea_dim))
if accessor_proto.accessor_class == "SparseAccessor":
if fea_dim != embedding_dim + 2:
raise ValueError(
"The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}".
format(embedding_dim + 2, fea_dim))
else:
if fea_dim != embedding_dim:
raise ValueError(
"The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}".
format(embedding_dim, fea_dim))
embedx_dim = accessor_proto.embedx_dim
if embedx_dim != embedding_dim - 1:
raise ValueError(
"The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}".
format(embedding_dim - 1, embedx_dim))
if accessor_proto.accessor_class == "SparseAccessor":
if embedx_dim != embedding_dim - 1:
raise ValueError(
"The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}".
format(embedding_dim - 1, embedx_dim))
else:
if embedx_dim != embedding_dim - 3:
raise ValueError(
"The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}".
format(embedding_dim - 3, embedx_dim))
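The dimension rules enforced above reduce to a small table. A sketch that mirrors the checks (the helper name is hypothetical):
def expected_dims(accessor_class, embedding_dim):
    # returns (fea_dim, embedx_dim) as check_embedding_dim expects them
    if accessor_class == "SparseAccessor":
        return embedding_dim + 2, embedding_dim - 1
    # CtrCommonAccessor and other CTR-style accessors
    return embedding_dim, embedding_dim - 3
assert expected_dims("SparseAccessor", 9) == (11, 8)
assert expected_dims("CtrCommonAccessor", 9) == (9, 6)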
class Service:
......@@ -119,11 +133,18 @@ class Accessor:
break
if not accessor_proto.HasField("accessor_class"):
accessor_proto.accessor_class = "CtrCommonAccessor"
# DownpourSparseValueAccessor
accessor_proto.accessor_class = "SparseAccessor"
if not accessor_proto.HasField("fea_dim"):
accessor_proto.fea_dim = embedding_dim + 2
if accessor_proto.accessor_class == "SparseAccessor":
accessor_proto.fea_dim = embedding_dim + 2
else:
accessor_proto.fea_dim = embedding_dim
if not accessor_proto.HasField("embedx_dim"):
accessor_proto.embedx_dim = embedding_dim - 1
if accessor_proto.accessor_class == "SparseAccessor":
accessor_proto.embedx_dim = embedding_dim - 1
else:
accessor_proto.embedx_dim = embedding_dim - 3
if not accessor_proto.HasField("embedx_threshold"):
accessor_proto.embedx_threshold = 0
......@@ -268,16 +289,16 @@ class CommonAccessor(Accessor):
attr_str = ""
origin_var_name = value_name
print("get_initializer_attr param name:", value_name)
# print("get_initializer_attr param name:", value_name)
for op in o_startup_program.global_block().ops:
if op.type in self.opt_init_map.keys(
) and origin_var_name == op.output("Out")[0]:
init_attr = [op.type]
print("get_initializer_attr op type:", op.type)
# print("get_initializer_attr op type:", op.type)
for attr in self.opt_init_map[op.type]:
print("get_initializer_attr opt_init_map attr:", attr)
# print("get_initializer_attr opt_init_map attr:", attr)
init_attr.append(str(op.attr(attr)))
print("get_initializer_attr op attr:", str(op.attr(attr)))
# print("get_initializer_attr op attr:", str(op.attr(attr)))
attr_str = l_in.join(init_attr)
break
return attr_str
......@@ -288,16 +309,16 @@ class CommonAccessor(Accessor):
size = ctx.sections()[0]
single_dim = ctx.sections()[1] if ctx.is_sparse() else 1
adam_d2sum = context["user_defined_strategy"].adam_d2sum
print("parse_by_optimizer table_id:{} is_datanorm:{}".format(
ctx.table_id(), ctx.is_datanorm_table()))
# print("parse_by_optimizer table_id:{} is_datanorm:{}".format(
# ctx.table_id(), ctx.is_datanorm_table()))
main_program, startup_program, idx = get_program_by_id(context,
ctx.program_id())
pserver_id = get_role_id(context['role_maker'])
pserver_num = len(get_ps_endpoints(context['role_maker']))
optimizer_ops = get_optimize_ops(main_program)
print("the one ps optimizer_ops:", optimizer_ops)
print("the one ps parse_by_optimizer grad_name:", grad_name)
# print("the one ps optimizer_ops:", optimizer_ops)
# print("the one ps parse_by_optimizer grad_name:", grad_name)
oop = None
for op in optimizer_ops:
......@@ -394,7 +415,7 @@ class CommonAccessor(Accessor):
initializer = self.get_initializer_attr(param.name,
startup_program)
elif formal_name == "SummaryDecayRate":
initializer = "fill_constant&0.99999"
initializer = "fill_constant&0.999999"
else:
initializer = "fill_constant&0"
initializers.append(initializer)
......@@ -740,7 +761,6 @@ class PsDescBuilder(object):
def _get_tables(self):
tables = []
for idx, (name, ctx) in enumerate(self.send_ctx.items()):
print('####### {}\n'.format(ctx.is_sparse()))
if ctx.is_sparse():
if self.ps_mode == DistributedMode.GEO:
tables.append(globals()['GeoSparseTable'](self.context,
......@@ -778,11 +798,11 @@ class PsDescBuilder(object):
return text_format.MessageToString(self.ps_desc)
def build_server_desc(self):
self.sparse_table_maps = {}
for table in self.tables:
table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add(
)
table._set(table_proto)
self.sparse_table_maps = {}
if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None:
self.sparse_table_maps[
table_proto.common.table_name] = table_proto.table_id
......@@ -801,6 +821,7 @@ class TheOnePSRuntime(RuntimeBase):
self._worker = fluid.core.DistFleetWrapper()
self._server_sub_program = []
self._heter_client = None
self._send_ctx = None
def _set_basic_info(self, context):
self.context = context
......@@ -835,7 +856,40 @@ class TheOnePSRuntime(RuntimeBase):
self.ps_desc_builder = PsDescBuilder(self.context)
def _init_worker(self):
def _init_params(self, scopes, send_ctx, recv_map):
for name, ctx in send_ctx.items():
if ctx.is_sparse():
continue
_, _, idx = get_program_by_id(self.context, ctx.program_id())
scope = scopes[idx]
table_id = ctx.table_id()
var_names = recv_map[table_id]
# print("init params:", idx, table_id, var_names)
self._worker.push_dense_params(scope, table_id, var_names)
def _pull_all_dense(self, scopes, send_ctx, recv_map):
for name, ctx in send_ctx.items():
if ctx.is_sparse():
continue
_, _, idx = get_program_by_id(self.context, ctx.program_id())
scope = scopes[idx]
table_id = ctx.table_id()
var_names = recv_map[table_id]
# print("pull all dense:", idx, table_id, var_names)
self._worker.pull_dense_params(scope, table_id, var_names)
def _pull_dense(self, program, scope, send_ctx, recv_map):
for name, ctx in send_ctx.items():
if ctx.is_sparse():
continue
if ctx.program_id() != id(program):
continue
table_id = ctx.table_id()
var_names = recv_map[table_id]
# print("pull dense:", table_id, var_names)
self._worker.pull_dense_params(scope, table_id, var_names)
def _init_worker(self, scopes=None):
worker_desc = self.ps_desc_builder.build_worker_desc()
if self.context['use_ps_gpu']:
......@@ -866,6 +920,7 @@ class TheOnePSRuntime(RuntimeBase):
split_dense_table=self.is_heter_ps_mode,
use_origin_program=self.is_heter_ps_mode,
ep_list=self.endpoints)
self._send_ctx = send_ctx
trainer_config = self.context['trainer']
debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
......@@ -889,23 +944,32 @@ class TheOnePSRuntime(RuntimeBase):
kwargs.update(sync_kwargs)
print("communicator config:", trainer_config.get_communicator_flags())
self._communicator = Communicator(
trainer_config.mode, kwargs,
trainer_config.get_communicator_flags())
self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt,
self.string_hosts,
fluid.global_scope())
role_id = get_role_id(self.role_maker)
self._worker.init_worker(proto_txt, self.string_hosts, role_id)
if self.context['ps_mode'] == DistributedMode.GEO:
self._communicator = Communicator(
trainer_config.mode, kwargs,
trainer_config.get_communicator_flags())
self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt,
self.string_hosts,
fluid.global_scope())
fleet.util.barrier()
info = self._communicator.get_client_info()
# info = self._communicator.get_client_info()
info = self._worker.get_client_info()
if isinstance(info, list) and len(info) > 0:
all_info = self.role_maker._all_gather(info[0])
# for unittest
if not isinstance(all_info, list):
warnings.warn("gloo may not initialize correctly")
all_info = [all_info]
self._communicator.set_clients(all_info)
self._communicator.create_client_to_client_connection()
# self._communicator.set_clients(all_info)
# self._communicator.create_client_to_client_connection()
self._worker.set_clients(all_info)
self._worker.create_client2client_connection()
print('create c2c connection done')
else:
print('cannot create c2c connection')
......@@ -914,6 +978,7 @@ class TheOnePSRuntime(RuntimeBase):
is_test = bool(int(os.getenv("TEST_MODE", "0")))
# for GEO
if self.role_maker._is_first_worker() and self.is_heter_ps_mode:
# for ps-heter mode load all parameters on first_worker
init_params = get_the_one_recv_context(
......@@ -921,16 +986,38 @@ class TheOnePSRuntime(RuntimeBase):
else:
init_params = dense_map
# if not is_test:
# self._communicator.init_params(init_params)
# fleet.util.barrier()
# self._communicator.pull_dense(init_params)
# fleet.util.barrier()
if scopes is None:
if len(self.origin_main_programs) > 1:
raise ValueError(
"You must set the scope list when you have Multiple programs"
)
scopes = [fluid.global_scope()]
if len(self.origin_main_programs) != len(scopes):
raise VauleError("len(programs) != len(scopes)")
self.scopes = scopes
if not is_test:
self._communicator.init_params(init_params)
if self.context['ps_mode'] == DistributedMode.GEO:
self._communicator.init_params(init_params)
else:
if role_id == 0:
self._init_params(scopes, send_ctx, dense_map)
fleet.util.barrier()
self._communicator.pull_dense(init_params)
self._pull_all_dense(scopes, send_ctx, dense_map)
fleet.util.barrier()
if not self._communicator.is_running():
self._communicator.start()
else:
warnings.warn("communicator has been initialized, skip")
if self.context['ps_mode'] == DistributedMode.GEO:
if not self._communicator.is_running():
self._communicator.start()
else:
warnings.warn("communicator has been initialized, skip")
launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
......@@ -996,7 +1083,9 @@ class TheOnePSRuntime(RuntimeBase):
self._server.run_server(host, int(port))
def _stop_worker(self):
self._communicator.stop()
if self.context['ps_mode'] == DistributedMode.GEO:
self._communicator.stop()
self._worker.stop_worker()
if self.is_heter_ps_mode:
assert self._heter_client != None, "heter client should not be None in heterps mode"
self._heter_client.stop()
......@@ -1151,7 +1240,11 @@ class TheOnePSRuntime(RuntimeBase):
"in fleet.save() function, executor must be as Executor type")
import paddle
program = self.origin_main_program if main_program is None else main_program
program = self.origin_main_programs[
0] if main_program is None else main_program
_, _, idx = get_program_by_id(self.context, id(program))
scope = self.scopes[idx]
print("save inference model scope idx:", idx)
if isinstance(program, CompiledProgram):
raise TypeError(
......@@ -1180,12 +1273,14 @@ class TheOnePSRuntime(RuntimeBase):
sparse_names = self._save_sparse_params(executor, dirname, sparses,
main_program, mode)
denses = get_the_one_recv_context(
dense_map = get_the_one_recv_context(
self.context, split_dense_table=self.is_heter_ps_mode)
send_ctx = get_the_one_send_context(
self.context,
is_dense=True,
split_dense_table=self.is_heter_ps_mode,
use_origin_program=True)
self._communicator.pull_dense(denses)
use_origin_program=self.is_heter_ps_mode,
ep_list=self.endpoints)
self._pull_dense(program, scope, send_ctx, dense_map)
generate_vars = self.context[
"user_defined_strategy"].trainer_desc_configs["stat_var_names"]
......@@ -1196,7 +1291,7 @@ class TheOnePSRuntime(RuntimeBase):
infer_program.list_vars()))
for var in remaining_vars:
tensor = var.get_value()
tensor = var.get_value(scope)
paddle.save(
tensor,
os.path.join(model_path, var.name),
......
......@@ -37,6 +37,37 @@ class PsProgramBuilder(object):
self.server_endpoints = self.attrs['role_maker']._get_pserver_endpoints(
)
def _build_trainer_desc(self):
opt_info = self.loss.block.program._fleet_opt
opt_info = {} if opt_info is None else opt_info
opt_info["trainer"] = opt_info.get("trainer", "DistMultiTrainer")
opt_info["device_worker"] = opt_info.get("device_worker",
"DownpourLite")
pid = str(id(self.cloned_main))
program_configs = {
pid: {
'pull_dense': [],
'push_dense': [],
'pull_sparse': [],
'push_sparse': []
}
}
dense_table_config = {}
send_ctx = get_the_one_send_context(self.attrs)
recv_ctx = get_the_one_recv_context(self.attrs)
for name, ctx in send_ctx.items():
if ctx.program_id() != id(self.loss.block.program):
continue
if ctx.is_sparse():
continue
if not ctx.is_tensor_table():
program_configs[pid]['pull_dense'].append(ctx.table_id())
program_configs[pid]['push_dense'].append(ctx.table_id())
dense_table_config[ctx.table_id()] = recv_ctx[ctx.table_id()]
opt_info['program_configs'] = program_configs
opt_info['dense_table_config'] = dense_table_config
self.cloned_main._fleet_opt = opt_info
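For reference, the `_fleet_opt` dict assembled here ends up shaped roughly as below; the program id, table ids, and variable names are made-up examples:
opt_info = {
    "trainer": "DistMultiTrainer",
    "device_worker": "DownpourLite",
    "program_configs": {
        "140362471853712": {  # str(id(self.cloned_main))
            "pull_dense": [0],
            "push_dense": [0],
            "pull_sparse": [],
            "push_sparse": [],
        }
    },
    "dense_table_config": {0: ["fc_0.w_0", "fc_0.b_0"]},
}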
def _optimize_programs(self):
pass
......@@ -63,7 +94,15 @@ class PsProgramBuilder(object):
logger.info("start building trainer program")
self._build_trainer_programs()
fluid.framework.switch_startup_program(self.cloned_startup)
# print("ps_program_build before =", id(self.loss.block.program))
self._build_trainer_desc()
self.loss.block.program = self.cloned_main
# print("ps_program_build after =", id(self.loss.block.program))
# print("ps_program_build clone after =", id(self.cloned_main))
# print("ps_program_build after trainer_desc",
# id(self.loss.block.program))
# print("ps_program build trainer desc",
# self.loss.block.program._fleet_opt)
elif self.attrs['is_server']:
logger.info("start building pserver program")
......@@ -92,6 +131,13 @@ class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式
return
def _build_pserver_programs(self):
add_listen_and_serv_pass = new_pass('add_listen_and_serv_pass',
self.attrs)
add_listen_and_serv_pass.apply([self.attrs['_main_server']], [None],
self.pass_ctx)
return
class CpuSyncPsProgramBuilder(PsProgramBuilder):
def __init__(self, pass_ctx):
......@@ -103,13 +149,13 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
format(self.ps_mode, "PsProgramBuilder"))
def _build_trainer_programs(self):
print("build trainer program entry")
print("before ps program builder program:", self.cloned_main)
# print("build trainer program entry")
# print("before ps program builder program:", self.cloned_main)
add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
self.attrs)
add_lr_decay_table_pass.apply([], [], self.pass_ctx)
print("before distributed op pass")
# print("before distributed op pass")
distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
......@@ -129,7 +175,7 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
self.attrs['origin_main_program'] = self.cloned_main
self.attrs['origin_startup_program'] = self.cloned_startup
print("after ps program builder program:", self.cloned_main)
# print("after ps program builder program:", self.cloned_main)
if self.launch_barrier and self.launch_barrier_flag:
wait_server_ready(self.server_endpoints)
......
......@@ -23,7 +23,6 @@ import logging
import six
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.core import CommContext
import paddle.fluid.framework as framework
import paddle.distributed.fleet as fleet
......@@ -73,9 +72,9 @@ def logger_config(log_path, logging_name):
return logger
ps_log_root_dir = '/ps_log/'
ps_log_root_dir = './ps_log/'
logger = logger_config(
log_path='/ps_usr_print_log', logging_name='ps_usr_print_log')
log_path='./ps_usr_print_log', logging_name='ps_usr_print_log')
class DistributedMode:
......@@ -342,6 +341,7 @@ def get_dense_send_context(program,
aggregate = True
print("public get_dense_send_context dense_table:", grad_name,
var_numel, origin_varnames)
from paddle.fluid.core import CommContext
dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
[var_numel], origin_varnames, trainer_id,
aggregate, False, False, idx, False, False,
......@@ -364,6 +364,7 @@ def get_dense_send_context(program,
aggregate = True
print("public get_dense_send_context data_norm table:", grad_name,
var_numel, origin_varnames)
from paddle.fluid.core import CommContext
data_norm_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
[var_numel], origin_varnames, trainer_id,
aggregate, False, False, idx, False, True,
......@@ -378,6 +379,7 @@ def get_dense_send_context(program,
var_numel = reduce(lambda x, y: x * y, var.shape)
grad_name = origin_varname
aggregate = True
from paddle.fluid.core import CommContext
dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
[var_numel], [origin_varname], trainer_id,
aggregate, False, False, idx, False, False,
......@@ -407,7 +409,7 @@ def get_geo_trainer_send_context(context):
var = program.global_block().vars[grad.merged_var.name]
var_numel = reduce(lambda x, y: x * y, var.shape[1:])
from paddle.fluid.core import CommContext
sparse_ctx = CommContext(grad_name, [grad_name],
["127.0.0.1:6071"], [var_numel],
[grad_name], trainer_id, True, True,
......@@ -432,6 +434,7 @@ def _step_ctx(idx, role_maker):
endpoints = get_ps_endpoints(role_maker)
sections = [1] * len(endpoints)
names = [name] * len(endpoints)
from paddle.fluid.core import CommContext
ctx = CommContext(name, names, endpoints, sections, [name], trainer_id,
True, False, False, idx, True, False, -1)
return name, ctx
......@@ -448,12 +451,8 @@ def get_the_one_send_context(context,
origin_programs = context['origin_main_programs']
idx = 0
for i, program in enumerate(origin_programs):
merged_dense_pairs = context['merged_dense_pairs'][i]
idx = get_dense_send_context(program, send_ctx, idx, merged_dense_pairs,
trainer_id, split_dense_table)
distibuted_varnames = get_sparse_tablenames(origin_programs, True)
print("public distibuted_varnames:", distibuted_varnames)
# print("public distibuted_varnames:", distibuted_varnames)
for i, program in enumerate(origin_programs):
merged_sparse_pairs = context['merged_sparse_pairs'][i]
for merged in merged_sparse_pairs:
......@@ -472,10 +471,11 @@ def get_the_one_send_context(context,
shape = list(var.shape)
shape[0] = 0 if is_distributed else shape[0]
print("public get_the_one_send_context sparse:", grad_name,
splited_varname, shape)
# print("public get_the_one_send_context sparse:", grad_name,
# splited_varname, shape)
if grad_name in send_ctx:
continue
from paddle.fluid.core import CommContext
sparse_ctx = CommContext(grad_name, splited_varname, ep_list, shape,
[grad_name], trainer_id, True, True,
is_distributed, idx, False, False,
......@@ -484,6 +484,11 @@ def get_the_one_send_context(context,
idx += 1
send_ctx[sparse_ctx.var_name()] = sparse_ctx
for i, program in enumerate(origin_programs):
merged_dense_pairs = context['merged_dense_pairs'][i]
idx = get_dense_send_context(program, send_ctx, idx, merged_dense_pairs,
trainer_id, split_dense_table)
if len(context['tensor_table']) > 0 and context['is_worker']:
name, ctx = _step_ctx(idx, context['role_maker'])
send_ctx[name] = ctx
......@@ -1258,8 +1263,8 @@ def build_var_distributed(context):
context["merged_variable_map"] = {}
for origin_program in origin_programs:
sparse_pairs, dense_pairs = get_param_grads(origin_program)
print("public build_var_distributed sparse_pairs:", sparse_pairs)
print("public build_var_distributed dense_pairs:", dense_pairs)
# print("public build_var_distributed sparse_pairs:", sparse_pairs)
# print("public build_var_distributed dense_pairs:", dense_pairs)
origin_for_sparse = []
origin_for_dense = []
merged_sparse_pairs = []
......@@ -1279,8 +1284,8 @@ def build_var_distributed(context):
m_grad = MergedVariable(grad, [grad], [0])
merged_variables_pairs.append((m_param, m_grad))
merged_dense_pairs.append((m_param, m_grad))
print("public build_var_distributed merged_dense_pairs:",
merged_dense_pairs)
# print("public build_var_distributed merged_dense_pairs:",
# merged_dense_pairs)
for sparse_pair in origin_for_sparse:
param, grad = sparse_pair
......@@ -1289,8 +1294,8 @@ def build_var_distributed(context):
m_grad = MergedVariable(grad, [grad], [0])
merged_variables_pairs.append((m_param, m_grad))
merged_sparse_pairs.append((m_param, m_grad))
print("public build_var_distributed merged_sparse_pairs:",
merged_sparse_pairs)
# print("public build_var_distributed merged_sparse_pairs:",
# merged_sparse_pairs)
for merged in merged_variables_pairs:
m_param, m_grad = merged
......@@ -1315,18 +1320,19 @@ def build_var_distributed(context):
context["param_name_to_grad_name"] = param_name_to_grad_name
context["grad_name_to_param_name"] = grad_name_to_param_name
print("public build_var_distributed origin_sparse_pairs:",
context["origin_sparse_pairs"])
print("public build_var_distributed origin_for_dense:",
context["origin_dense_pairs"])
print("public build_var_distributed merged_sparse_pairs:",
context["merged_sparse_pairs"])
print("public build_var_distributed merged_dense_pairs:",
context['merged_dense_pairs'])
print("public build_var_distributed param_name_to_grad_name:",
param_name_to_grad_name)
print("public build_var_distributed grad_name_to_param_name:",
grad_name_to_param_name)
# print("public build_var_distributed origin_sparse_pairs:",
# context["origin_sparse_pairs"])
# print("public build_var_distributed origin_for_dense:",
# context["origin_dense_pairs"])
# print("public build_var_distributed merged_sparse_pairs:",
# context["merged_sparse_pairs"])
# print("public build_var_distributed merged_dense_pairs:",
# context['merged_dense_pairs'])
# print("public build_var_distributed param_name_to_grad_name:",
# param_name_to_grad_name)
# print("public build_var_distributed grad_name_to_param_name:",
# grad_name_to_param_name)
def _is_opt_role_op(op):
......
......@@ -62,13 +62,18 @@ class Communicator(object):
"""
# set all recv op to not_run mode
if mode == DistributedMode.SYNC:
envs["pserver_endpoints"] = ','.join(kwargs["pserver_endpoints"])
envs["trainers"] = str(kwargs["trainers"])
envs["trainer_id"] = str(kwargs["trainer_id"])
envs["need_global_step"] = str(kwargs["need_global_step"])
envs["barrier_table_id"] = str(kwargs["barrier_table_id"])
if kwargs == None:
if envs == None:
envs = {}
else:
if mode == DistributedMode.SYNC:
envs["pserver_endpoints"] = ','.join(kwargs[
"pserver_endpoints"])
envs["trainers"] = str(kwargs["trainers"])
envs["trainer_id"] = str(kwargs["trainer_id"])
envs["need_global_step"] = str(kwargs["need_global_step"])
envs["barrier_table_id"] = str(kwargs["barrier_table_id"])
mode_str = None
......@@ -129,6 +134,9 @@ class Communicator(object):
comm.start()
comm.stop()
"""
if self.communicator_ is None:
print('you must call init_with_ctx first to init the communicator before calling start')
return
self.communicator_.start()
def stop(self):
......@@ -148,6 +156,9 @@ class Communicator(object):
comm.start()
comm.stop()
"""
if self.communicator_ is None:
print('you must call init_with_ctx first to init the communicator before calling stop')
return
self.communicator_.stop()
def is_running(self):
......@@ -166,6 +177,9 @@ class Communicator(object):
comm = fluid.communicator.Communicator(prog)
comm.is_running()
"""
if self.communicator_ is None:
print('you must call init_with_ctx first to init the communicator before calling is_running')
return
return self.communicator_.is_running()
def recv(self):
......
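The new guards make start(), stop() and is_running() safe no-ops when init_with_ctx has not run. A minimal usage sketch following the docstrings quoted in these hunks (the program construction is illustrative only):

import paddle.fluid as fluid

prog = fluid.Program()
comm = fluid.communicator.Communicator(prog)
comm.start()   # without init_with_ctx this now prints a hint and returns
comm.stop()    # likewise a guarded no-op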
......@@ -862,9 +862,9 @@ class InMemoryDataset(DatasetBase):
thread_num(int): shuffle thread num. Default is 12.
"""
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
if fleet is not None:
if not isinstance(fleet, PSLib):
if hasattr(fleet, "barrier_worker"):
print("pscore fleet")
fleet.barrier_worker()
else:
fleet._role_maker.barrier_worker()
......@@ -879,20 +879,20 @@ class InMemoryDataset(DatasetBase):
self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds)
if fleet is not None:
if not isinstance(fleet, PSLib):
if hasattr(fleet, "barrier_worker"):
fleet.barrier_worker()
else:
fleet._role_maker.barrier_worker()
self.dataset.global_shuffle(thread_num)
if fleet is not None:
if not isinstance(fleet, PSLib):
if hasattr(fleet, "barrier_worker"):
fleet.barrier_worker()
else:
fleet._role_maker.barrier_worker()
if self.merge_by_lineid:
self.dataset.merge_by_lineid()
if fleet is not None:
if not isinstance(fleet, PSLib):
if hasattr(fleet, "barrier_worker"):
fleet.barrier_worker()
else:
fleet._role_maker.barrier_worker()
......@@ -1026,9 +1026,8 @@ class InMemoryDataset(DatasetBase):
local_data_size = np.array([local_data_size])
print('global shuffle local_data_size: ', local_data_size)
if fleet is not None:
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
global_data_size = local_data_size * 0
if not isinstance(fleet, PSLib):
if hasattr(fleet, "util"):
global_data_size = fleet.util.all_reduce(local_data_size)
else:
fleet._role_maker.all_reduce_worker(local_data_size,
......
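These hunks replace the isinstance(fleet, PSLib) checks with hasattr duck-typing, so any fleet object that exposes barrier_worker() is handled uniformly while PSLib-style fleets fall back to their role maker. A sketch of the resulting dispatch, with a hypothetical helper name:

def barrier_all_workers(fleet):
    # New-style fleets expose barrier_worker() directly; older
    # PSLib-style fleets only expose it on their role maker.
    if fleet is None:
        return
    if hasattr(fleet, "barrier_worker"):
        fleet.barrier_worker()
    else:
        fleet._role_maker.barrier_worker()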
......@@ -99,6 +99,7 @@ class Hogwild(DeviceWorker):
dense_table_set = set()
program_id = str(id(self._program))
print("device worker program id:", program_id)
if self._program is None:
print("program of current device worker is not configured")
exit(-1)
......@@ -115,15 +116,20 @@ class Hogwild(DeviceWorker):
from paddle.fluid.incubate.fleet.parameter_server import version
if version.is_transpiler() and "fleet_desc" not in opt_info:
if version.is_transpiler(
) and "fleet_desc" not in opt_info and "program_configs" not in opt_info:
return
program_configs = opt_info["program_configs"]
print("device worker program_configs:", program_configs)
for pid in program_configs:
print("device worker", pid, program_id)
if pid == program_id:
pc = downpour.program_config.add()
pc.program_id = program_id
print("device worker pull dense:",
program_configs[program_id]["pull_dense"])
for i in program_configs[program_id]["push_sparse"]:
pc.push_sparse_table_id.extend([i])
for i in program_configs[program_id]["push_dense"]:
......@@ -139,50 +145,189 @@ class Hogwild(DeviceWorker):
trainer_desc.device_worker_name = "HogwildWorker"
pull_thread = trainer_desc.pull_dense_param
pull_thread.device_num = trainer_desc.thread_num
if opt_info.get("program_id_to_worker") is None:
raise ValueError("opt_info must have program_id_to_worker")
prog_id_to_worker = opt_info["program_id_to_worker"]
if prog_id_to_worker.get(program_id) is None:
raise ValueError("%s not found in program_id_to_worker" %
program_id)
worker = opt_info["program_id_to_worker"][program_id]
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
if opt_info.get("program_id_to_worker") is None and opt_info.get(
"dense_table_config") is None:
raise ValueError(
"opt_info must have program_id_to_worker or dense_table_config")
if opt_info.get("program_id_to_worker") is not None:
prog_id_to_worker = opt_info["program_id_to_worker"]
if prog_id_to_worker.get(program_id) is None:
raise ValueError("%s not found in program_id_to_worker" %
program_id)
worker = opt_info["program_id_to_worker"][program_id]
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.table_id = \
i.table_id
sparse_len = len(worker.get_desc().sparse_table)
for i in range(sparse_len):
sparse_table = downpour.sparse_table.add()
sparse_table.table_id = worker.get_desc().sparse_table[
i].table_id
sparse_table.sparse_key_name.extend(worker.get_desc()
.sparse_table[i].slot_key)
sparse_table.sparse_value_name.extend(worker.get_desc(
).sparse_table[i].slot_value)
sparse_table.sparse_grad_name.extend(worker.get_desc(
).sparse_table[i].slot_gradient)
sparse_table.fea_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
i].accessor.fea_dim
# not use emb_dim
sparse_table.emb_dim = -1
# not use hard code click
sparse_table.label_var_name = ""
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = downpour.dense_table.add()
dense_table.table_id = i.table_id
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.dense_grad_name.extend(
i.dense_gradient_variable_name)
hogwild.skip_ops.extend(worker.get_desc().skip_op)
else:
dense_table_config = opt_info.get("dense_table_config")
print("device worker dense_table_config:", dense_table_config)
for table_id, varnames in dense_table_config.items():
dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.table_id = \
i.table_id
sparse_len = len(worker.get_desc().sparse_table)
for i in range(sparse_len):
sparse_table = downpour.sparse_table.add()
sparse_table.table_id = worker.get_desc().sparse_table[i].table_id
sparse_table.sparse_key_name.extend(worker.get_desc().sparse_table[
i].slot_key)
sparse_table.sparse_value_name.extend(worker.get_desc()
.sparse_table[i].slot_value)
sparse_table.sparse_grad_name.extend(worker.get_desc().sparse_table[
i].slot_gradient)
sparse_table.fea_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
i].accessor.fea_dim
# not use emb_dim
sparse_table.emb_dim = -1
# not use hard code click
sparse_table.label_var_name = ""
dense_table.dense_value_name.extend(varnames)
dense_table.table_id = table_id
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = downpour.dense_table.add()
dense_table.table_id = i.table_id
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.dense_grad_name.extend(
i.dense_gradient_variable_name)
hogwild.skip_ops.extend(worker.get_desc().skip_op)
if self._infer:
hogwild.skip_ops.extend(
["push_sparse", "push_sparse_v2", "push_dense"])
class DownpourLite(DeviceWorker):
"""
DownpourLite is a lightweight device worker for the distributed Downpour SGD algorithm.
"""
def __init__(self):
"""Init."""
super(DownpourLite, self).__init__()
def _gen_worker_desc(self, trainer_desc):
"""
Generate worker desc, whose device worker is DownpourLiteWorker.
Args:
trainer_desc(TrainerDesc): a TrainerDesc object
"""
print("create DownpourLiteWorker")
trainer_desc.device_worker_name = "DownpourLiteWorker"
if self._infer:
# just ignore feed op for inference model
trainer_desc.downpour_param.skip_ops.extend([
"feed", "push_sparse", "push_sparse_v2", "push_dense",
"distributed_push_sparse", "send"
])
dense_table_set = set()
program_id = str(id(self._program))
print("device worker program id:", program_id)
if self._program is None:
print("program of current device worker is not configured")
exit(-1)
opt_info = self._program._fleet_opt
# when opt_info is None or empty dict, it should return
if not opt_info:
return
downpour = trainer_desc.downpour_param
if opt_info["stat_var_names"]:
for i in opt_info["stat_var_names"]:
downpour.stat_var_names.extend([i])
from paddle.fluid.incubate.fleet.parameter_server import version
if version.is_transpiler(
) and "fleet_desc" not in opt_info and "program_configs" not in opt_info:
return
program_configs = opt_info["program_configs"]
print("device worker program_configs:", program_configs)
for pid in program_configs:
print("device worker", pid, program_id)
if pid == program_id:
pc = downpour.program_config.add()
pc.program_id = program_id
print("device worker pull dense:",
program_configs[program_id]["pull_dense"])
for i in program_configs[program_id]["push_sparse"]:
pc.push_sparse_table_id.extend([i])
for i in program_configs[program_id]["push_dense"]:
pc.push_dense_table_id.extend([i])
dense_table_set.add(i)
for i in program_configs[program_id]["pull_sparse"]:
pc.pull_sparse_table_id.extend([i])
for i in program_configs[program_id]["pull_dense"]:
pc.pull_dense_table_id.extend([i])
dense_table_set.add(i)
break
pull_thread = trainer_desc.pull_dense_param
pull_thread.device_num = trainer_desc.thread_num
if opt_info.get("program_id_to_worker") is None and opt_info.get(
"dense_table_config") is None:
raise ValueError(
"opt_info must have program_id_to_worker or dense_table_config")
if opt_info.get("program_id_to_worker") is not None:
prog_id_to_worker = opt_info["program_id_to_worker"]
if prog_id_to_worker.get(program_id) is None:
raise ValueError("%s not found in program_id_to_worker" %
program_id)
worker = opt_info["program_id_to_worker"][program_id]
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.table_id = \
i.table_id
sparse_len = len(worker.get_desc().sparse_table)
for i in range(sparse_len):
sparse_table = downpour.sparse_table.add()
sparse_table.table_id = worker.get_desc().sparse_table[
i].table_id
sparse_table.sparse_key_name.extend(worker.get_desc()
.sparse_table[i].slot_key)
sparse_table.sparse_value_name.extend(worker.get_desc(
).sparse_table[i].slot_value)
sparse_table.sparse_grad_name.extend(worker.get_desc(
).sparse_table[i].slot_gradient)
sparse_table.fea_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
i].accessor.fea_dim
# not use emb_dim
sparse_table.emb_dim = -1
# not use hard code click
sparse_table.label_var_name = ""
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = downpour.dense_table.add()
dense_table.table_id = i.table_id
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.dense_grad_name.extend(
i.dense_gradient_variable_name)
downpour.skip_ops.extend(worker.get_desc().skip_op)
else:
dense_table_config = opt_info.get("dense_table_config")
print("device worker dense_table_config:", dense_table_config)
for table_id, varnames in dense_table_config.items():
dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(varnames)
dense_table.table_id = table_id
if self._infer:
downpour.skip_ops.extend(
["push_sparse", "push_sparse_v2", "push_dense"])
class DownpourSGD(DeviceWorker):
"""
DownpourSGD is a kind of distributed SGD algorithm.
......
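Both Hogwild and DownpourLite now accept a dense_table_config fallback when program_id_to_worker is absent from opt_info. A hypothetical opt_info illustrating that path (the table ids and variable names are invented for the example):

opt_info = {
    "stat_var_names": [],
    "program_configs": {},          # present but empty in this sketch
    "dense_table_config": {
        0: ["fc_0.w_0", "fc_0.b_0"],  # table id -> dense variable names
        1: ["fc_1.w_0", "fc_1.b_0"],
    },
}

With this dict, the pid loop over program_configs matches nothing, the program_id_to_worker branch is skipped, and each (table_id, varnames) pair feeds one pull_thread.dense_table entry.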
......@@ -57,8 +57,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir'])
self.ps_launch()
file1 = '/ps_log/async_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/async_run_minimize_debug:_1_worker_main.prototxt'
file1 = './ps_log/async_run_minimize_debug:_0_worker_main.prototxt'
file2 = './ps_log/async_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_async passed!')
else:
......@@ -79,8 +79,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir'])
self.ps_launch()
'''
file1 = '/ps_log/sync_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/sync_run_minimize_debug:_1_worker_main.prototxt'
file1 = './ps_log/sync_run_minimize_debug:_0_worker_main.prototxt'
file2 = './ps_log/sync_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_sync passed!')
else:
......@@ -102,8 +102,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir'])
self.ps_launch()
file1 = '/ps_log/geo_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/geo_run_minimize_debug:_1_worker_main.prototxt'
file1 = './ps_log/geo_run_minimize_debug:_0_worker_main.prototxt'
file2 = './ps_log/geo_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_geo passed!')
else:
......@@ -130,10 +130,10 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir'])
self.ps_launch('heter-ps')
'''
file1 = '/ps_log/heter_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/heter_run_minimize_debug:_1_worker_main.prototxt'
file3 = '/ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt'
file4 = '/ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt'
file1 = './ps_log/heter_run_minimize_debug:_0_worker_main.prototxt'
file2 = './ps_log/heter_run_minimize_debug:_1_worker_main.prototxt'
file3 = './ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt'
file4 = './ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt'
if self.check(file1, file2) and self.check(file3, file4):
logger.info('test_ps_optimizer_minimize_heter passed!')
else:
......@@ -155,8 +155,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir'])
self.ps_launch("gpu-ps")
file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt'
file1 = './ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt'
file2 = './ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_gpu passed!')
else:
......@@ -180,8 +180,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir'])
self.ps_launch("cpu-ps")
file1 = '/ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt'
file2 = '/ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt'
file1 = './ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt'
file2 = './ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_append_send_ops_pass passed!')
else:
......@@ -192,5 +192,5 @@ class TestPsTrainerPass(PsPassTestBase):
if __name__ == '__main__':
remove_path_if_exists('/ps_log')
remove_path_if_exists('./ps_log')
unittest.main()
......@@ -50,11 +50,11 @@ class MKLDNNBF16ActivationOp(object):
self.dtype = np.uint16
self.init_data()
self.config()
self.set_attrs()
self.out = self.op_forward(self.x)
self.inputs = {'X': convert_float_to_uint16(self.x)}
self.outputs = {'Out': self.out}
self.set_attrs()
def calculate_grads(self):
self.dx = self.op_grad(self.out, self.x)
......@@ -162,5 +162,110 @@ class TestMKLDNNMishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
return dout * ((np.exp(x) * omega) / delta**2)
class TestMKLDNNRelu6BF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "relu6"
def op_forward(self, x):
return np.clip(x, 0, 6)
def op_grad(self, dout, x):
return np.where((x > 0) & (x <= 6), dout, 0)
class TestMKLDNNLeakyReluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "leaky_relu"
def op_forward(self, x):
return np.where(x > 0, x, self.alpha * x)
def op_grad(self, dout, x):
return np.where(x > 0, dout, self.alpha * dout)
def set_attrs(self):
self.alpha = 0.2
self.attrs = {"use_mkldnn": True, "alpha": self.alpha}
class TestMKLDNNSwishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "swish"
def expit(self, val):
return 1 / (1 + np.exp(-self.beta * val))
def op_forward(self, x):
return x * self.expit(x)
def op_grad(self, dout, x):
return dout * self.expit(x) * (1 + self.beta * x * (1 - self.expit(x)))
def set_attrs(self):
self.beta = 0.2
self.attrs = {"use_mkldnn": True, "beta": self.beta}
class TestMKLDNNHardSwishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "hard_swish"
def op_forward(self, x):
result = np.where(x < -3, 0, x)
return np.where(result > 3, result, result * (result + 3) / 6)
def op_grad(self, dout, x):
# hard_swish gradient: 0 below -3, 1 above 3, (2x + 3) / 6 in between
return np.where(x < -3, 0, np.where(x > 3, dout, dout * (2 * x + 3) / 6))
class TestMKLDNNTanhBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "tanh"
def op_forward(self, x):
return np.tanh(x)
def op_grad(self, dout, x):
return dout * (1 - np.tanh(x)**2)
class TestMKLDNNAbsBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "abs"
def op_forward(self, x):
return np.absolute(x)
def op_grad(self, dout, x):
return dout * np.sign(x)
class TestMKLDNNEluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "elu"
def op_forward(self, x):
return np.where(x > 0, x, self.alpha * (np.exp(x) - 1))
def op_grad(self, dout, x):
return np.where(x > 0, dout, dout * self.alpha * np.exp(x))
def set_attrs(self):
self.alpha = 0.2
self.attrs = {"use_mkldnn": True, "alpha": self.alpha}
class TestMKLDNNExpBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "exp"
def op_forward(self, x):
return np.exp(x)
def op_grad(self, dout, x):
return dout * np.exp(x)
if __name__ == '__main__':
unittest.main()
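Each new test pairs a NumPy forward (op_forward) with an analytic gradient (op_grad). As a standalone sanity check, not part of the test file, a finite-difference probe of the relu6 pair above, sampled away from the kinks at 0 and 6:

import numpy as np

def forward(v):
    return np.clip(v, 0, 6)

x = np.array([-1.0, 0.5, 3.0, 5.5, 7.0])
eps = 1e-6
analytic = np.where((x > 0) & (x <= 6), 1.0, 0.0)
numeric = (forward(x + eps) - forward(x - eps)) / (2 * eps)
assert np.allclose(analytic, numeric, atol=1e-5)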
......@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool
from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -30,23 +30,32 @@ def ref_softplus(x, beta, threshold):
return out
@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)),
"GPU is not supported")
@OpTestTool.skip_if_not_cpu_bf16()
class TestSoftplusOneDNNOp(OpTest):
def setUp(self):
self.op_type = "softplus"
self.beta = 1
self.threshold = 20
self.config()
self.set_dtype()
self.attrs = {'use_mkldnn': True, 'beta': self.beta}
self.inputs = {'X': np.random.random(self.x_shape).astype(np.float32)}
self.x = np.random.random(self.x_shape)
self.out = ref_softplus(self.x, self.beta, self.threshold)
if self.dtype != np.float32:
self.x = convert_float_to_uint16(self.x)
self.inputs = {'X': self.x}
self.outputs = {
'Out': ref_softplus(self.inputs['X'], self.beta, self.threshold)
'Out': self.out
}
def config(self):
self.x_shape = (10, 10)
def set_dtype(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output()
......@@ -73,6 +82,27 @@ class TestSoftplus3DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp):
self.beta = 0.4
class TestSoftplusBF16OneDNNOp(TestSoftplusOneDNNOp):
def set_dtype(self):
self.dtype = np.uint16
class TestSoftplus4DBF16OneDNNOp(TestSoftplus4DOneDNNOp):
def set_dtype(self):
self.dtype = np.uint16
class TestSoftplus6DBF16OneDNNOp(TestSoftplus6DOneDNNOp):
def set_dtype(self):
self.dtype = np.uint16
class TestSoftplus3DExtendedFunctorBF16OneDNNOp(
TestSoftplus3DExtendedFunctorOneDNNOp):
def set_dtype(self):
self.dtype = np.uint16
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
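The BF16 variants only override set_dtype(); the reference output stays in float32. For context, a hedged restatement of the softplus reference these tests rely on, matching the beta/threshold semantics used above (a sketch, not the file's exact ref_softplus):

import numpy as np

def softplus_ref(x, beta=1.0, threshold=20.0):
    # log1p keeps the small-x branch stable; clamping before exp avoids
    # overflow on the branch that is discarded by np.where anyway.
    scaled = np.minimum(beta * x, threshold)
    return np.where(beta * x > threshold, x, np.log1p(np.exp(scaled)) / beta)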
......@@ -26,7 +26,7 @@ import paddle
from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import *
from paddle.distributed.ps.utils.public import logger, ps_log_root_dir
from ps_dnn_trainer import DnnTrainer
from paddle.distributed.fleet.proto import ps_pb2
import paddle.distributed.fleet.proto.the_one_ps_pb2 as ps_pb2
from google.protobuf import text_format
......
......@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest
import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker
import time
......
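This hunk, and the identical ones below, hoist os.environ["WITH_DISTRIBUTE"] = "ON" above the paddle import. The pattern only works because an environment flag that a library inspects at import time must be set before that import runs; in isolation:

import os
os.environ["WITH_DISTRIBUTE"] = "ON"  # must be set before paddle is imported

import paddle  # any import-time check of WITH_DISTRIBUTE now sees "ON"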
......@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest
import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker
import time
......
......@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest
import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker
import time
......
......@@ -309,7 +309,7 @@ class TestFleetBase(unittest.TestCase):
(tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log))
def _run_cluster(self, model, envs):
env = {'GRAD_CLIP': str(self._grad_clip_mode)}
env = {'GRAD_CLIP': str(self._grad_clip_mode), 'WITH_DISTRIBUTE': 'ON'}
python_path = self._python_interp
gloo_path = tempfile.mkdtemp()
......@@ -343,7 +343,8 @@ class TestFleetBase(unittest.TestCase):
tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log = tr1
# Wait until trainer process terminate
time_out = 120
#time_out = 120
time_out = 60
cur_time = 0
while True:
......
......@@ -51,8 +51,9 @@ class TestDistMnistAsyncInMemoryDataset2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self):
self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
# self.check_with_place(
# "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
class TestDistMnistAsync2x2(TestFleetBase):
......@@ -85,8 +86,9 @@ class TestDistMnistAsync2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self):
self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
# self.check_with_place(
# "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
class TestDistCtrHalfAsync2x2(TestFleetBase):
......@@ -122,8 +124,9 @@ class TestDistCtrHalfAsync2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self):
self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
# self.check_with_place(
# "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
if __name__ == "__main__":
......
......@@ -52,8 +52,9 @@ class TestDistMnistSync2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self):
self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
# self.check_with_place(
# "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
# @unittest.skip(reason="Skip unstable ut, reader need to be rewrite")
......@@ -91,8 +92,9 @@ class TestDistMnistAsyncDataset2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self):
self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
# self.check_with_place(
# "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
if __name__ == "__main__":
......
......@@ -15,6 +15,7 @@
from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest
import paddle
import paddle.fluid as fluid
......
......@@ -13,14 +13,14 @@
# limitations under the License.
from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
import unittest
import paddle
import os
paddle.enable_static()
# For Net
......@@ -74,11 +74,12 @@ class TestExponentialDecay(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)
optimizer.minimize([loss])
fleet.init_server()
if __name__ == '__main__':
os.environ["GLOG_v"] = "4"
os.environ["GLOG_logtostderr"] = "1"
unittest.main()
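The two GLOG variables set in __main__ follow standard glog semantics; as a minimal sketch of their effect on the C++ side of the runtime:

import os

os.environ["GLOG_v"] = "4"            # enable VLOG(n) output for n <= 4
os.environ["GLOG_logtostderr"] = "1"  # send C++ logs to stderr, not files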
......@@ -15,6 +15,8 @@
from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest
import tempfile
import shutil
......
......@@ -15,6 +15,8 @@
from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest
import tempfile
import shutil
......
......@@ -13,10 +13,12 @@
# limitations under the License.
from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid as fluid
import os
import unittest
import paddle
paddle.enable_static()
......
......@@ -13,10 +13,11 @@
# limitations under the License.
from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid as fluid
import os
import unittest
import paddle
paddle.enable_static()
......
......@@ -13,10 +13,11 @@
# limitations under the License.
from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid as fluid
import os
import unittest
import paddle
paddle.enable_static()
......
......@@ -23,7 +23,7 @@ local_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer, PSGPUTrainer, HeterPipelineTrainer
from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT, HeterSection
from .device_worker import Hogwild, DownpourSGD, DownpourLite, Section, DownpourSGDOPT, HeterSection
from .framework import Variable
from multiprocessing import Process, Manager
......