Commit 5dcfb699 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -90,9 +90,9 @@ endif()
 if (WITH_ASCEND_CL)
   macro(find_ascend_toolkit_version ascend_toolkit_version_info)
     file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
-    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
-    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
-    string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
+    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
+    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
+    string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
     add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
     if(NOT ASCEND_TOOLKIT_VERSION)
       set(ASCEND_TOOLKIT_VERSION "???")
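The new pattern additionally accepts release-candidate toolkit versions and strips upper-case letters when deriving CANN_VERSION_CODE. As a quick illustration, the standalone C++ snippet below exercises the same regular expression on two hypothetical version_info strings; it is not part of the commit, and the sample strings are assumptions.

#include <iostream>
#include <regex>
#include <string>

// Illustration only: mirrors the CMake regex above to show that "RC" releases
// are now matched alongside purely numeric versions.
int main() {
  const std::regex pat(
      "version=([0-9]+\\.[0-9]+\\.(RC)?[0-9]+\\.[a-z]*[0-9]*)");
  for (const std::string s :
       {"version=5.0.2.alpha005", "version=5.1.RC1.alpha001"}) {
    std::smatch m;
    std::cout << s << " -> "
              << (std::regex_search(s, m, pat) ? m[1].str() : "no match")
              << std::endl;
  }
  return 0;
}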
@@ -238,7 +238,7 @@ int32_t BrpcPsClient::initialize() {
       std::thread(std::bind(&BrpcPsClient::push_dense_task_consume, this));
   // for debug
   // _print_thread =
   //     std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));
   return 0;
 }
@@ -1315,11 +1315,11 @@ std::future<int32_t> BrpcPsClient::push_sparse(size_t table_id,
   CostTimer parse_timer("pserver_client_push_sparse_parse");
   int push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
   while (push_sparse_async_num > FLAGS_pserver_max_async_call_num) {
-    // LOG(INFO) << "push_sparse Waiting for async_call_num comsume, task_num:"
-    // << push_sparse_async_num << ", max_task_limit:" <<
-    // FLAGS_pserver_max_async_call_num;
+    // LOG(INFO) << "push_sparse Waiting for async_call_num comsume,
+    // task_num:"
+    // << push_sparse_async_num
+    // << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
     usleep(5000);  // 5ms
-    // push_sparse_async_num = _push_sparse_task_queue_map[table_id]->size();
     push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
   }
   auto put_timer = std::make_shared<CostTimer>("client_push_sparse_put");
@@ -1381,8 +1381,7 @@ void BrpcPsClient::push_sparse_task_consume() {
   ::ThreadPool async_push_sparse_shard_threads(
       FLAGS_pserver_sparse_merge_thread);
   while (_running) {
-    platform::Timer timeline;
-    timeline.Start();
+    auto async_start_time_ms = butil::gettimeofday_ms();
     // process the pushTasks of all sparse tables
     for (auto &push_sparse_task_itr : _push_sparse_task_queue_map) {
       auto table_id = push_sparse_task_itr.first;
@@ -1497,9 +1496,8 @@ void BrpcPsClient::push_sparse_task_consume() {
         std::vector<std::future<int>>().swap(merge_status);
       }
     }
-    timeline.Pause();
-    auto wait_ms =
-        FLAGS_pserver_async_push_sparse_interval_ms - (timeline.ElapsedMS());
+    auto wait_ms = FLAGS_pserver_async_push_sparse_interval_ms -
+                   (butil::gettimeofday_ms() - async_start_time_ms);
     if (wait_ms > 0) {
       usleep(wait_ms * 1000);
     }
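The consume loops above replace platform::Timer with a plain start timestamp from butil::gettimeofday_ms() and then sleep only for the remainder of the configured interval. A minimal sketch of that pacing pattern, using std::chrono in place of butil and hypothetical names:

#include <chrono>
#include <thread>

// Sketch only: run one consume pass, measure how long it took, then sleep for
// the rest of interval_ms so passes start roughly once per interval.
template <typename WorkFn>
void paced_loop(const bool& running, int interval_ms, WorkFn&& do_one_pass) {
  while (running) {
    const auto start = std::chrono::steady_clock::now();
    do_one_pass();  // drain and send the queued push tasks
    const auto elapsed_ms =
        std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - start)
            .count();
    const auto wait_ms = interval_ms - elapsed_ms;
    if (wait_ms > 0) {
      std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
    }
  }
}

int main() {
  bool running = false;  // a real consumer thread would keep this true
  paced_loop(running, 10, [] { /* send one batch */ });
  return 0;
}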
@@ -1661,9 +1659,10 @@ std::future<int32_t> BrpcPsClient::push_dense(const Region *regions,
       std::make_shared<CostTimer>("pserver_client_push_dense_parse");
   int push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
   while (push_dense_async_num > FLAGS_pserver_max_async_call_num) {
-    LOG(INFO) << "push_dense Waiting for async_call_num comsume, task_num:"
-              << push_dense_async_num
-              << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
+    // LOG(INFO) << "push_dense Waiting for async_call_num comsume,
+    // task_num:"
+    // << push_dense_async_num
+    // << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
     usleep(5000);  // 5ms
     push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
   }
@@ -1701,8 +1700,7 @@ void BrpcPsClient::push_dense_task_consume() {
   static bool scale_gradient = FLAGS_pserver_scale_gradient_by_merge;
   ::ThreadPool async_merge_dense_threads(10);
   while (_running) {
-    platform::Timer timeline;
-    timeline.Start();
+    auto async_start_time_ms = butil::gettimeofday_ms();
     for (auto &task_queue_itr : _push_dense_task_queue_map) {
       auto &task_queue = task_queue_itr.second;
       auto queue_size = task_queue->Size();
@@ -1791,9 +1789,8 @@ void BrpcPsClient::push_dense_task_consume() {
       push_dense_raw_gradient(task_ptr, total_send_data, total_send_data_size,
                               closure);
     }
-    timeline.Pause();
-    auto wait_ms =
-        FLAGS_pserver_async_push_dense_interval_ms - (timeline.ElapsedMS());
+    auto wait_ms = FLAGS_pserver_async_push_dense_interval_ms -
+                   (butil::gettimeofday_ms() - async_start_time_ms);
     if (wait_ms > 0) {
       usleep(wait_ms * 1000);
     }
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
 #include <google/protobuf/text_format.h>
 #include "gflags/gflags.h"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
+#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
@@ -66,34 +65,9 @@ std::shared_ptr<Communicator> Communicator::communicator_(nullptr);
 void Communicator::InitBrpcClient(
     const std::string &dist_desc,
     const std::vector<std::string> &host_sign_list) {
-  // not used, just for psclient's init
-  std::map<uint64_t, std::vector<paddle::distributed::Region>>
-      _dense_pull_regions;
-  for (auto &iter : recv_varname_to_ctx_) {
-    auto tid = iter.first;
-    auto var_names = iter.second;
-    auto &regions = _dense_pull_regions[tid];
-    regions.reserve(var_names.size());
-    for (auto &t : var_names) {
-      Variable *var = recv_scope_->FindVar(t);
-      LoDTensor *tensor = var->GetMutable<LoDTensor>();
-      float *w = tensor->data<float>();
-      paddle::distributed::Region reg(w, tensor->numel());
-      regions.emplace_back(std::move(reg));
-    }
-  }
+  auto fleet = paddle::distributed::FleetWrapper::GetInstance();
   if (_worker_ptr.get() == nullptr) {
-    google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param);
-    init_gflag(_ps_param.init_gflags());
-    servers_ = host_sign_list.size();
-    _ps_env = paddle::distributed::PaddlePSEnvironment();
-    _ps_env.set_ps_servers(&host_sign_list, servers_);
-    _worker_ptr = std::unique_ptr<paddle::distributed::PSClient>(
-        paddle::distributed::PSClientFactory::create(_ps_param));
-    _worker_ptr->configure(_ps_param, _dense_pull_regions, _ps_env,
-                           trainer_id_);
+    _worker_ptr = fleet->worker_ptr_;
   }
   return;
 }
@@ -146,11 +120,11 @@ void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,
   for (auto &t : varnames) {
     Variable *var = scope->FindVar(t);
     LoDTensor *tensor = var->GetMutable<LoDTensor>();
-    VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
+    VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
            << platform::is_gpu_place(tensor->place());
     float *temp_recv_data = tensor->mutable_data<float>(platform::CPUPlace());
-    VLOG(1) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
+    VLOG(3) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
            << table_id << " Temp_data[0] " << temp_recv_data[0]
            << " Temp_data[-1] " << temp_recv_data[tensor->numel() - 1];
     if (platform::is_gpu_place(tensor->place())) {
@@ -481,7 +455,7 @@ void AsyncCommunicator::RecvNoBarrier() {
     for (auto &t : var_names) {
       Variable *var = recv_scope_->FindVar(t);
       LoDTensor *tensor = var->GetMutable<LoDTensor>();
-      VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
+      VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
              << platform::is_gpu_place(tensor->place());
       if (platform::is_gpu_place(tensor->place())) {
 #ifdef PADDLE_WITH_CUDA
@@ -653,7 +627,7 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
         input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
     if (batch_size == -1) {
       batch_size = cur_batch_size;
-    } else {
+    } else if (batch_size != cur_batch_size) {
       // CHECK(batch_size == cur_batch_size);  // NOLINT
       batch_size_consist = false;
       break;
@@ -676,7 +650,8 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
   size_t output_len = 0;
   size_t input_idx = 0;
-  VLOG(2) << "fleet.cc::emb_dim: " << fea_dim;
+  VLOG(2) << "fleet.cc::emb_dim: " << fea_dim << " batch_size: " << batch_size
+          << " batch_size_consist: " << batch_size_consist;
   // TODO(zhaocaibei123): check type of show/clk is int? float? uint64?
   // const long int* show_tensor = shows->data<int64_t>();
@@ -687,13 +662,14 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
   for (size_t index = 0; index < inputs->size(); ++index) {
     framework::LoDTensor *g_tensor = outputs->at(index);
     float *g = g_tensor->data<float>();
+    // no cvm
     if (batch_size_consist) {  // TODO(zhaocaibei123): add config
                                // scale_sparse_gradient_with_batch_size_
       Eigen::Map<
           Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
           g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
-      g_mat.rightCols(fea_dim) *= batch_size;
+      g_mat.rightCols(fea_dim - 2) *=
+          batch_size;  // hard code here, because of cvm_grad op
     }
     const framework::LoDTensor *tensor = inputs->at(index);
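With the cvm_grad path, each row of the pushed gradient is laid out as [show_grad, click_grad, embedding grads...], so only the rightmost fea_dim - 2 columns are scaled by the batch size. A small self-contained Eigen sketch of that scaling; the 5-column layout and the values are assumptions, not Paddle code:

#include <Eigen/Dense>
#include <iostream>

// Illustration: the two leading cvm columns stay untouched, the embedding
// columns are multiplied by the batch size.
int main() {
  const int fea_dim = 5;            // 2 cvm columns + 3 embedding columns
  const float batch_size = 4.0f;
  Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> g =
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic,
                    Eigen::RowMajor>::Ones(2, fea_dim);
  g.rightCols(fea_dim - 2) *= batch_size;  // mirrors the change above
  std::cout << g << std::endl;             // each row prints: 1 1 4 4 4
  return 0;
}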
@@ -710,16 +686,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
           continue;
         }
         push_keys.emplace_back(real_id);
-        push_values.emplace_back(fea_dim + 3);
+        push_values.emplace_back(fea_dim + 1);
         // slot show clk grad... consistent with CtrCommonPushValue defined in
         // ctr_accessor.h
         push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
-        push_values.back()[1] =
-            (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
-        push_values.back()[2] =
-            (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
-        float *data = push_values.back().data() + 3;
+        // push_values.back()[1] =
+        //     (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+        // push_values.back()[2] =
+        //     (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
+        float *data = push_values.back().data() + 1;  // hard code here
         memcpy(data, g + output_len, sizeof(float) * fea_dim);
@@ -733,16 +709,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
         continue;
       }
       push_keys.emplace_back(real_id);
-      push_values.emplace_back(fea_dim + 3);
+      push_values.emplace_back(fea_dim + 1);
       // slot show clk grad... consistent with CtrCommonPushValue defined in
       // ctr_accessor.h
       push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
-      push_values.back()[1] =
-          (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
-      push_values.back()[2] =
-          (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
-      float *data = push_values.back().data() + 3;
+      // push_values.back()[1] =
+      //     (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+      // push_values.back()[2] =
+      //     (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
+      float *data = push_values.back().data() + 1;
       memcpy(data, g + output_len, sizeof(float) * fea_dim);
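Both branches above now build a push record of fea_dim + 1 floats, [slot, grad[0..fea_dim-1]], and no longer pack show/click at indices 1 and 2. A hedged sketch of that record construction; make_push_value is a hypothetical helper, not a Paddle API:

#include <cstring>
#include <vector>

// Sketch of the reworked push record: slot id first, then the raw gradient.
std::vector<float> make_push_value(const float* grad, int fea_dim, float slot) {
  std::vector<float> push_value(fea_dim + 1);
  push_value[0] = slot;  // e.g. the hard-coded 2 used above
  std::memcpy(push_value.data() + 1, grad, sizeof(float) * fea_dim);
  return push_value;
}

int main() {
  std::vector<float> grad = {0.1f, 0.2f, 0.3f};
  auto v = make_push_value(grad.data(), static_cast<int>(grad.size()), 2.f);
  return v.size() == 4 ? 0 : 1;  // slot + 3 gradient entries
}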
@@ -837,7 +813,7 @@ void AsyncCommunicator::Stop() {
   if (!communicator_) {
     VLOG(0) << "Communicator is not inited, do nothing";
   } else {
-    _worker_ptr->finalize_worker();
+    // _worker_ptr->finalize_worker();
     VLOG(1) << "client finalize_worker done";
     if (recv_thread_) {
       VLOG(1) << "stop recv thread";
@@ -360,13 +360,13 @@ class Communicator {
   PSClient *GetPsClient() { return _worker_ptr.get(); }
-  std::unique_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
+  std::shared_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
     return std::move(_worker_ptr);
   }
   RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; }
-  std::unique_ptr<PSClient> _worker_ptr;  // pointer to worker
+  std::shared_ptr<PSClient> _worker_ptr;  // pointer to worker
  protected:
   bool running_ = false;
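_worker_ptr changes from unique_ptr to shared_ptr because the PSClient is now created and owned by FleetWrapper and merely shared with the Communicator. A toy model of that ownership arrangement, using stub types rather than Paddle classes:

#include <cassert>
#include <memory>

struct Client {};  // stands in for paddle::distributed::PSClient

struct Fleet {     // stands in for FleetWrapper
  std::shared_ptr<Client> worker_ptr_ = std::make_shared<Client>();
};

struct Comm {      // stands in for Communicator
  std::shared_ptr<Client> _worker_ptr;
};

int main() {
  Fleet fleet;
  Comm comm;
  comm._worker_ptr = fleet.worker_ptr_;  // share, do not transfer ownership
  assert(comm._worker_ptr.get() == fleet.worker_ptr_.get());
  assert(fleet.worker_ptr_.use_count() == 2);
  return 0;
}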
@@ -43,11 +43,12 @@ set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPI
 set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(downpour_ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
 cc_library(ctr_double_accessor SRCS ctr_double_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
-cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
+cc_library(ctr_accessor SRCS ctr_accessor.cc sparse_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
 cc_library(downpour_ctr_accessor SRCS downpour_ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
 cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)
@@ -115,6 +115,8 @@ int32_t CommonDenseTable::initialize_optimizer() {
     // optimizer_->set_global_lr(_global_lr);  //no use
   } else if (name == "sum") {
     optimizer_ = std::make_shared<DSUM>(common, &values_);
+  } else if (name == "summary") {
+    optimizer_ = std::make_shared<DSummary>(common, &values_);
   } else {
     VLOG(0) << "init optimizer failed";
   }
@@ -339,19 +341,27 @@ int32_t CommonDenseTable::save(const std::string& path,
   auto common = _config.common();
   int size = static_cast<int>(common.params().size());
-  std::ostringstream os;
-  for (int x = 0; x < size; ++x) {
-    auto& varname = common.params()[x];
-    auto& dim = common.dims()[x];
-    VLOG(0) << "CommonDenseTable::save dim " << x << " size: " << dim;
-    for (int y = 0; y < dim; ++y) {
-      os.clear();
-      os.str("");
-      os << values_[x][y];
-      if (dim == param_dim_) {
-        result_buffer_param[y].emplace_back(std::move(os.str()));
-      } else {
-        result_buffer_fixed_len.emplace_back(std::move(os.str()));
+  if (_config.common().name() == "summary") {
+    for (int x = 0; x < param_dim_; ++x) {
+      result_buffer_param[x].emplace_back(
+          std::to_string(values_[param_idx_][x]));
+    }
+  } else {
+    std::ostringstream os;
+    for (int x = 0; x < size; ++x) {
+      auto& varname = common.params()[x];
+      auto& dim = common.dims()[x];
+      VLOG(3) << "CommonDenseTable::save dim " << x << " size: " << dim;
+      for (int y = 0; y < dim; ++y) {
+        os.clear();
+        os.str("");
+        os << values_[x][y];
+        if (dim == param_dim_) {
+          result_buffer_param[y].emplace_back(std::move(os.str()));
+        } else {
+          result_buffer_fixed_len.emplace_back(std::move(os.str()));
+        }
       }
     }
   }
@@ -65,7 +65,7 @@ size_t CtrCommonAccessor::mf_size() {
 // pull value
 size_t CtrCommonAccessor::select_dim() {
   auto embedx_dim = _config.embedx_dim();
-  return 1 + embedx_dim;
+  return 3 + embedx_dim;
 }
 size_t CtrCommonAccessor::select_dim_size(size_t dim) { return sizeof(float); }
@@ -213,6 +213,10 @@ int32_t CtrCommonAccessor::select(float** select_values, const float** values,
   for (size_t value_item = 0; value_item < num; ++value_item) {
     float* select_value = select_values[value_item];
     const float* value = values[value_item];
+    select_value[CtrCommonPullValue::show_index()] =
+        value[common_feature_value.show_index()];
+    select_value[CtrCommonPullValue::click_index()] =
+        value[common_feature_value.click_index()];
     select_value[CtrCommonPullValue::embed_w_index()] =
         value[common_feature_value.embed_w_index()];
     memcpy(select_value + CtrCommonPullValue::embedx_w_index(),
@@ -24,6 +24,7 @@
 namespace paddle {
 namespace distributed {
+// DownpourUnitAccessor
 class CtrCommonAccessor : public ValueAccessor {
  public:
   struct CtrCommonFeatureValue {
@@ -106,15 +107,25 @@ class CtrCommonAccessor : public ValueAccessor {
   struct CtrCommonPullValue {
     /*
+    float show;
+    float click;
     float embed_w;
     std::vector<float> embedx_w;
     */
-    static int dim(int embedx_dim) { return 1 + embedx_dim; }
+    static int dim(int embedx_dim) { return 3 + embedx_dim; }
     static int dim_size(size_t dim) { return sizeof(float); }
     static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
-    static int embed_w_index() { return 0; }
-    static int embedx_w_index() { return 1; }
+    static int show_index() { return 0; }
+    static int click_index() { return 1; }
+    static int embed_w_index() { return 2; }
+    static int embedx_w_index() { return 3; }
+    static float& show(float* val) {
+      return val[CtrCommonPullValue::show_index()];
+    }
+    static float& click(float* val) {
+      return val[CtrCommonPullValue::click_index()];
+    }
     static float& embed_w(float* val) {
       return val[CtrCommonPullValue::embed_w_index()];
     }
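A pulled CtrCommon record therefore now carries show and click ahead of the embedding, giving 3 + embedx_dim floats per key. The standalone sketch below unpacks one such record for an assumed embedx_dim of 8; the numeric values are made up.

#include <cstdio>
#include <vector>

// Hypothetical unpacking of one pulled value laid out as
// [show, click, embed_w, embedx_w[0..embedx_dim-1]], i.e. 3 + embedx_dim.
int main() {
  const int embedx_dim = 8;
  std::vector<float> pulled(3 + embedx_dim, 0.f);
  pulled[0] = 100.f;  // show
  pulled[1] = 7.f;    // click
  pulled[2] = 0.25f;  // embed_w
  // pulled[3..10] hold embedx_w

  const float show = pulled[0], click = pulled[1], embed_w = pulled[2];
  const float* embedx_w = pulled.data() + 3;
  std::printf("show=%.0f click=%.0f embed_w=%.2f embedx_w[0]=%.2f\n", show,
              click, embed_w, embedx_w[0]);
  return 0;
}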
@@ -196,26 +196,19 @@ class DAdamD2Sum : public DenseOptimizer {
     for (int x = 0; x < static_cast<int>(names.size()); ++x) {
       if (names[x] == "LearningRate") {
         learning_rate = (*values)[x].data();
-      }
-      if (names[x] == "Param") {
+      } else if (names[x] == "Param") {
         param = (*values)[x].data();
-      }
-      if (names[x] == "Moment") {
+      } else if (names[x] == "Moment") {
         mom_velocity = (*values)[x].data();
-      }
-      if (names[x] == "G2Sum") {
+      } else if (names[x] == "G2Sum") {
         ada_g2sum = (*values)[x].data();
-      }
-      if (names[x] == "D2Sum") {
+      } else if (names[x] == "D2Sum") {
         ada_d2sum = (*values)[x].data();
-      }
-      if (names[x] == "MomentDecayRate") {
+      } else if (names[x] == "MomentDecayRate") {
        mom_decay_rate = (*values)[x].data();
-      }
-      if (names[x] == "AdaDecayRate") {
+      } else if (names[x] == "AdaDecayRate") {
        ada_decay_rate = (*values)[x].data();
-      }
-      if (names[x] == "AdaEpsilon") {
+      } else if (names[x] == "AdaEpsilon") {
        ada_epsilon = (*values)[x].data();
      }
    }
@@ -268,5 +261,34 @@ class DAdamD2Sum : public DenseOptimizer {
   float* ada_epsilon;
 };
+// for data_norm
+class DSummary : public DenseOptimizer {
+ public:
+  explicit DSummary(const CommonAccessorParameter& accessor,
+                    std::vector<std::vector<float>>* values) {
+    auto& names = accessor.params();
+    for (int x = 0; x < static_cast<int>(names.size()); ++x) {
+      if (names[x] == "Param") {
+        param = (*values)[x].data();
+      } else if (names[x] == "SummaryDecayRate") {
+        summary_decay_rate = (*values)[x].data();
+      }
+    }
+  }
+  void update(const float* update_values, size_t num, int begin,
+              int end) override {
+    auto update_numel = end - begin;
+    Eigen::Map<Eigen::MatrixXf> mat_w(param + begin, 1, update_numel);
+    Eigen::Map<const Eigen::MatrixXf> mat_grad(update_values + begin, 1,
+                                               update_numel);
+    mat_w = mat_w * summary_decay_rate_d + mat_grad;
+  }
+  float* summary_decay_rate;
+  double summary_decay_rate_d = 0.999999;
+  float* param;
+};
 }  // namespace distributed
 }  // namespace paddle
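DSummary keeps running statistics for data_norm by decaying the stored parameter with a rate very close to one and then adding the incoming update, i.e. param = param * 0.999999 + grad. A standalone sketch of that update on a plain array, without Eigen, purely for illustration:

#include <cstdio>
#include <vector>

// Illustrative summary update: values decay slightly, then accumulate the
// incoming per-batch statistics.
void summary_update(std::vector<float>& param, const std::vector<float>& grad,
                    double decay_rate = 0.999999) {
  for (size_t i = 0; i < param.size(); ++i) {
    param[i] = static_cast<float>(param[i] * decay_rate + grad[i]);
  }
}

int main() {
  std::vector<float> param(4, 10.0f);
  const std::vector<float> grad = {1.f, 2.f, 3.f, 4.f};
  summary_update(param, grad);
  std::printf("%.5f %.5f %.5f %.5f\n", param[0], param[1], param[2], param[3]);
  return 0;
}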
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/ps/table/sparse_accessor.h"
#include <gflags/gflags.h>
#include "glog/logging.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace distributed {
int SparseAccessor::initialize() {
auto name = _config.embed_sgd_param().name();
_embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
_embed_sgd_rule->load_config(_config.embed_sgd_param(), 1);
name = _config.embedx_sgd_param().name();
_embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
_embedx_sgd_rule->load_config(_config.embedx_sgd_param(),
_config.embedx_dim());
sparse_feature_value.embed_sgd_dim = _embed_sgd_rule->dim();
sparse_feature_value.embedx_dim = _config.embedx_dim();
sparse_feature_value.embedx_sgd_dim = _embedx_sgd_rule->dim();
_show_click_decay_rate = _config.ctr_accessor_param().show_click_decay_rate();
return 0;
}
void SparseAccessor::GetTableInfo(AccessorInfo& info) {
info.dim = dim();
info.size = size();
info.select_dim = select_dim();
info.select_size = select_size();
info.update_dim = update_dim();
info.update_size = update_size();
info.fea_dim = fea_dim();
}
size_t SparseAccessor::dim() { return sparse_feature_value.dim(); }
size_t SparseAccessor::dim_size(size_t dim) {
auto embedx_dim = _config.embedx_dim();
return sparse_feature_value.dim_size(dim, embedx_dim);
}
size_t SparseAccessor::size() { return sparse_feature_value.size(); }
size_t SparseAccessor::mf_size() {
return (_config.embedx_dim() + sparse_feature_value.embedx_sgd_dim) *
sizeof(float); // embedx embedx_g2sum
}
// pull value
size_t SparseAccessor::select_dim() {
auto embedx_dim = _config.embedx_dim();
return 1 + embedx_dim;
}
size_t SparseAccessor::select_dim_size(size_t dim) { return sizeof(float); }
size_t SparseAccessor::select_size() { return select_dim() * sizeof(float); }
// push value
size_t SparseAccessor::update_dim() {
auto embedx_dim = _config.embedx_dim();
return 4 + embedx_dim;
}
size_t SparseAccessor::update_dim_size(size_t dim) { return sizeof(float); }
size_t SparseAccessor::update_size() { return update_dim() * sizeof(float); }
bool SparseAccessor::shrink(float* value) {
auto base_threshold = _config.ctr_accessor_param().base_threshold();
auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
auto delete_after_unseen_days =
_config.ctr_accessor_param().delete_after_unseen_days();
auto delete_threshold = _config.ctr_accessor_param().delete_threshold();
// time_decay first
sparse_feature_value.show(value) *= _show_click_decay_rate;
sparse_feature_value.click(value) *= _show_click_decay_rate;
// shrink after
auto score = show_click_score(sparse_feature_value.show(value),
sparse_feature_value.click(value));
auto unseen_days = sparse_feature_value.unseen_days(value);
if (score < delete_threshold || unseen_days > delete_after_unseen_days) {
return true;
}
return false;
}
bool SparseAccessor::save(float* value, int param) {
auto base_threshold = _config.ctr_accessor_param().base_threshold();
auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days();
if (param == 2) {
delta_threshold = 0;
}
switch (param) {
// save all
case 0: {
return true;
}
// save xbox delta
case 1:
// save xbox base
case 2: {
if (show_click_score(sparse_feature_value.show(value),
sparse_feature_value.click(value)) >=
base_threshold &&
sparse_feature_value.delta_score(value) >= delta_threshold &&
sparse_feature_value.unseen_days(value) <= delta_keep_days) {
// do this after save, because it must not be modified when retry
if (param == 2) {
sparse_feature_value.delta_score(value) = 0;
}
return true;
} else {
return false;
}
}
// already decayed in shrink
case 3: {
// do this after save, because it must not be modified when retry
// sparse_feature_value.unseen_days(value)++;
return true;
}
// save revert batch_model
case 5: {
return true;
}
default:
return true;
}
}
void SparseAccessor::update_stat_after_save(float* value, int param) {
auto base_threshold = _config.ctr_accessor_param().base_threshold();
auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days();
if (param == 2) {
delta_threshold = 0;
}
switch (param) {
case 1: {
if (show_click_score(sparse_feature_value.show(value),
sparse_feature_value.click(value)) >=
base_threshold &&
sparse_feature_value.delta_score(value) >= delta_threshold &&
sparse_feature_value.unseen_days(value) <= delta_keep_days) {
sparse_feature_value.delta_score(value) = 0;
}
}
return;
case 3: {
sparse_feature_value.unseen_days(value)++;
}
return;
default:
return;
}
}
int32_t SparseAccessor::create(float** values, size_t num) {
auto embedx_dim = _config.embedx_dim();
for (size_t value_item = 0; value_item < num; ++value_item) {
float* value = values[value_item];
value[sparse_feature_value.unseen_days_index()] = 0;
value[sparse_feature_value.delta_score_index()] = 0;
value[sparse_feature_value.show_index()] = 0;
value[sparse_feature_value.click_index()] = 0;
value[sparse_feature_value.slot_index()] = -1;
_embed_sgd_rule->init_value(
value + sparse_feature_value.embed_w_index(),
value + sparse_feature_value.embed_g2sum_index());
_embedx_sgd_rule->init_value(
value + sparse_feature_value.embedx_w_index(),
value + sparse_feature_value.embedx_g2sum_index(), false);
}
return 0;
}
bool SparseAccessor::need_extend_mf(float* value) {
float show = value[sparse_feature_value.show_index()];
float click = value[sparse_feature_value.click_index()];
float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() +
click * _config.ctr_accessor_param().click_coeff();
return score >= _config.embedx_threshold();
}
bool SparseAccessor::has_mf(size_t size) {
return size > sparse_feature_value.embedx_g2sum_index();
}
// from SparseFeatureValue to SparsePullValue
int32_t SparseAccessor::select(float** select_values, const float** values,
size_t num) {
auto embedx_dim = _config.embedx_dim();
for (size_t value_item = 0; value_item < num; ++value_item) {
float* select_value = select_values[value_item];
const float* value = values[value_item];
select_value[SparsePullValue::embed_w_index()] =
value[sparse_feature_value.embed_w_index()];
memcpy(select_value + SparsePullValue::embedx_w_index(),
value + sparse_feature_value.embedx_w_index(),
embedx_dim * sizeof(float));
}
return 0;
}
// from SparsePushValue to SparsePushValue
// first dim: item
// second dim: field num
int32_t SparseAccessor::merge(float** update_values,
const float** other_update_values, size_t num) {
auto embedx_dim = _config.embedx_dim();
size_t total_dim = SparsePushValue::dim(embedx_dim);
for (size_t value_item = 0; value_item < num; ++value_item) {
float* update_value = update_values[value_item];
const float* other_update_value = other_update_values[value_item];
for (auto i = 0u; i < total_dim; ++i) {
if (i != SparsePushValue::slot_index()) {
update_value[i] += other_update_value[i];
}
}
}
return 0;
}
// from SparsePushValue to SparseFeatureValue
// first dim: item
// second dim: field num
int32_t SparseAccessor::update(float** update_values, const float** push_values,
size_t num) {
auto embedx_dim = _config.embedx_dim();
for (size_t value_item = 0; value_item < num; ++value_item) {
float* update_value = update_values[value_item];
const float* push_value = push_values[value_item];
float push_show = push_value[SparsePushValue::show_index()];
float push_click = push_value[SparsePushValue::click_index()];
float slot = push_value[SparsePushValue::slot_index()];
update_value[sparse_feature_value.show_index()] += push_show;
update_value[sparse_feature_value.click_index()] += push_click;
update_value[sparse_feature_value.slot_index()] = slot;
update_value[sparse_feature_value.delta_score_index()] +=
(push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() +
push_click * _config.ctr_accessor_param().click_coeff();
update_value[sparse_feature_value.unseen_days_index()] = 0;
_embed_sgd_rule->update_value(
update_value + sparse_feature_value.embed_w_index(),
update_value + sparse_feature_value.embed_g2sum_index(),
push_value + SparsePushValue::embed_g_index());
_embedx_sgd_rule->update_value(
update_value + sparse_feature_value.embedx_w_index(),
update_value + sparse_feature_value.embedx_g2sum_index(),
push_value + SparsePushValue::embedx_g_index());
}
return 0;
}
bool SparseAccessor::create_value(int stage, const float* value) {
// stage == 0, pull
// stage == 1, push
if (stage == 0) {
return true;
} else if (stage == 1) {
// operation
auto show = SparsePushValue::show(const_cast<float*>(value));
auto click = SparsePushValue::click(const_cast<float*>(value));
auto score = show_click_score(show, click);
if (score <= 0) {
return false;
}
if (score >= 1) {
return true;
}
return local_uniform_real_distribution<float>()(local_random_engine()) <
score;
} else {
return true;
}
}
float SparseAccessor::show_click_score(float show, float click) {
auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff();
auto click_coeff = _config.ctr_accessor_param().click_coeff();
return (show - click) * nonclk_coeff + click * click_coeff;
}
std::string SparseAccessor::parse_to_string(const float* v, int param) {
thread_local std::ostringstream os;
os.clear();
os.str("");
os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " "
<< v[5];
for (int i = sparse_feature_value.embed_g2sum_index();
i < sparse_feature_value.embedx_w_index(); i++) {
os << " " << v[i];
}
auto show = sparse_feature_value.show(const_cast<float*>(v));
auto click = sparse_feature_value.click(const_cast<float*>(v));
auto score = show_click_score(show, click);
if (score >= _config.embedx_threshold() &&
param > sparse_feature_value.embedx_w_index()) {
for (auto i = sparse_feature_value.embedx_w_index();
i < sparse_feature_value.dim(); ++i) {
os << " " << v[i];
}
}
return os.str();
}
int SparseAccessor::parse_from_string(const std::string& str, float* value) {
int embedx_dim = _config.embedx_dim();
_embedx_sgd_rule->init_value(
value + sparse_feature_value.embedx_w_index(),
value + sparse_feature_value.embedx_g2sum_index());
auto ret = paddle::string::str_to_float(str.data(), value);
CHECK(ret >= 6) << "expect more than 6 real:" << ret;
return ret;
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "paddle/fluid/distributed/common/registerer.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"
namespace paddle {
namespace distributed {
// no show click, for word2vec(DownpourSparseValueAccessor)
class SparseAccessor : public ValueAccessor {
public:
struct SparseFeatureValue {
/*
float slot;
float unseen_days;
float delta_score;
float show;
float click;
float embed_w;
std::vector<float> embed_g2sum;
std::vector<float> embedx_w;
std::vector<float> embedx_g2sum;
*/
int dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; }
int dim_size(size_t dim, int embedx_dim) { return sizeof(float); }
int size() { return dim() * sizeof(float); }
int slot_index() { return 0; }
int unseen_days_index() { return slot_index() + 1; }
int delta_score_index() { return unseen_days_index() + 1; }
int show_index() { return delta_score_index() + 1; }
int click_index() { return show_index() + 1; }
int embed_w_index() { return click_index() + 1; }
int embed_g2sum_index() { return embed_w_index() + 1; }
int embedx_w_index() { return embed_g2sum_index() + embed_sgd_dim; }
int embedx_g2sum_index() { return embedx_w_index() + embedx_dim; }
float& unseen_days(float* val) { return val[unseen_days_index()]; }
float& delta_score(float* val) { return val[delta_score_index()]; }
float& show(float* val) { return val[show_index()]; }
float& click(float* val) { return val[click_index()]; }
float& slot(float* val) { return val[slot_index()]; }
float& embed_w(float* val) { return val[embed_w_index()]; }
float& embed_g2sum(float* val) { return val[embed_g2sum_index()]; }
float& embedx_w(float* val) { return val[embedx_w_index()]; }
float& embedx_g2sum(float* val) { return val[embedx_g2sum_index()]; }
int embed_sgd_dim;
int embedx_dim;
int embedx_sgd_dim;
};
struct SparsePushValue {
/*
float slot;
float show;
float click;
float embed_g;
std::vector<float> embedx_g;
*/
static int dim(int embedx_dim) { return 4 + embedx_dim; }
static int dim_size(int dim, int embedx_dim) { return sizeof(float); }
static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
static int slot_index() { return 0; }
static int show_index() { return SparsePushValue::slot_index() + 1; }
static int click_index() { return SparsePushValue::show_index() + 1; }
static int embed_g_index() { return SparsePushValue::click_index() + 1; }
static int embedx_g_index() { return SparsePushValue::embed_g_index() + 1; }
static float& slot(float* val) {
return val[SparsePushValue::slot_index()];
}
static float& show(float* val) {
return val[SparsePushValue::show_index()];
}
static float& click(float* val) {
return val[SparsePushValue::click_index()];
}
static float& embed_g(float* val) {
return val[SparsePushValue::embed_g_index()];
}
static float* embedx_g(float* val) {
return val + SparsePushValue::embedx_g_index();
}
};
struct SparsePullValue {
/*
float embed_w;
std::vector<float> embedx_w;
*/
static int dim(int embedx_dim) { return 1 + embedx_dim; }
static int dim_size(size_t dim) { return sizeof(float); }
static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
static int embed_w_index() { return 0; }
static int embedx_w_index() { return 1; }
static float& embed_w(float* val) {
return val[SparsePullValue::embed_w_index()];
}
static float* embedx_w(float* val) {
return val + SparsePullValue::embedx_w_index();
}
};
SparseAccessor() {}
virtual int initialize();
virtual void GetTableInfo(AccessorInfo& info);
virtual ~SparseAccessor() {}
// dimension of the value
virtual size_t dim();
// size of each dimension of the value
virtual size_t dim_size(size_t dim);
// total size of all dimensions of the value
virtual size_t size();
// total size of the dynamic-length mf part of the value; only used for sparse
virtual size_t mf_size();
// dimension of the pull value
virtual size_t select_dim();
// size of each dimension of the pull value
virtual size_t select_dim_size(size_t dim);
// total size of all dimensions of the pull value
virtual size_t select_size();
// dimension of the push value
virtual size_t update_dim();
// size of each dimension of the push value
virtual size_t update_dim_size(size_t dim);
// total size of all dimensions of the push value
virtual size_t update_size();
// decide whether this value should be shrunk
virtual bool shrink(float* value);
// decide whether this value should be saved to ssd
// virtual bool save_ssd(float* value);
virtual bool need_extend_mf(float* value);
virtual bool has_mf(size_t size);
// decide whether this value is dumped during the save stage;
// param identifies the save stage, e.g. downpour's xbox and batch_model
// param = 0, save all feature
// param = 1, save delta feature
// param = 2, save xbox base feature
bool save(float* value, int param) override;
// update delta_score and unseen_days after save
void update_stat_after_save(float* value, int param) override;
// when keys do not exist, generate random values for them;
// the memory of value must already be allocated by the caller
virtual int32_t create(float** value, size_t num);
// select from values into select_values
virtual int32_t select(float** select_values, const float** values,
                       size_t num);
// merge update_values together
virtual int32_t merge(float** update_values,
                      const float** other_update_values, size_t num);
// merge update_values together; it.next decides whether to move to the next key
// virtual int32_t merge(float** update_values, iterator it);
// apply update_values onto values
virtual int32_t update(float** values, const float** update_values,
size_t num);
std::string parse_to_string(const float* value, int param) override;
int32_t parse_from_string(const std::string& str, float* v) override;
virtual bool create_value(int type, const float* value);
// currently this interface is only used to fetch show
float get_field(float* value, const std::string& name) override {
// CHECK(name == "show");
if (name == "show") {
return sparse_feature_value.show(value);
}
return 0.0;
}
private:
// float show_click_score(float show, float click);
// SparseValueSGDRule* _embed_sgd_rule;
// SparseValueSGDRule* _embedx_sgd_rule;
// SparseFeatureValue sparse_feature_value;
float _show_click_decay_rate;
int32_t _ssd_unseenday_threshold;
public: // TODO(zhaocaibei123): it should be private, but we make it public
// for unit test
SparseFeatureValue sparse_feature_value;
float show_click_score(float show, float click);
SparseValueSGDRule* _embed_sgd_rule;
SparseValueSGDRule* _embedx_sgd_rule;
};
} // namespace distributed
} // namespace paddle
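To make the SparseFeatureValue layout above concrete, the sketch below reproduces its index arithmetic for one assumed configuration (embed_sgd_dim = 1, embedx_dim = 8, embedx_sgd_dim = 1); it is an illustration, not part of the accessor.

#include <cstdio>

// Mirrors SparseFeatureValue's offsets:
// [slot, unseen_days, delta_score, show, click, embed_w,
//  embed_g2sum..., embedx_w..., embedx_g2sum...]
int main() {
  const int embed_sgd_dim = 1;   // assumed SGD-rule width
  const int embedx_dim = 8;      // assumed embedding width
  const int embedx_sgd_dim = 1;  // assumed SGD-rule width

  const int slot = 0;
  const int unseen_days = slot + 1;
  const int delta_score = unseen_days + 1;
  const int show = delta_score + 1;
  const int click = show + 1;
  const int embed_w = click + 1;
  const int embed_g2sum = embed_w + 1;
  const int embedx_w = embed_g2sum + embed_sgd_dim;
  const int embedx_g2sum = embedx_w + embedx_dim;
  const int dim = 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim;

  std::printf("embedx_w starts at %d, embedx_g2sum at %d, total dim %d\n",
              embedx_w, embedx_g2sum, dim);  // prints 7, 15, 16
  return 0;
}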
@@ -27,6 +27,7 @@
 #endif
 #include "paddle/fluid/distributed/ps/table/ctr_accessor.h"
 #include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/sparse_accessor.h"
 #include "paddle/fluid/distributed/ps/table/tensor_accessor.h"
 #include "paddle/fluid/distributed/ps/table/tensor_table.h"
@@ -49,6 +50,7 @@ REGISTER_PSCORE_CLASS(Table, MemorySparseTable);
 REGISTER_PSCORE_CLASS(Table, MemorySparseGeoTable);
 REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor);
 REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor);
+REGISTER_PSCORE_CLASS(ValueAccessor, SparseAccessor);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
@@ -71,11 +71,22 @@ class FleetWrapper : public PSWrapper {
   }
   virtual int32_t Initialize(InitContext& context) { return 0; }
+  // TODO(zhaocaibei123: later)
+  int32_t CopyTable(const uint64_t src_table_id, const uint64_t dest_table_id);
+  int32_t CopyTableByFeasign(const uint64_t src_table_id,
+                             const uint64_t dest_table_id,
+                             const std::vector<uint64_t>& feasign_list);
+  typedef std::function<void(int, int)> HeterCallBackFunc;
+  int RegisterHeterCallback(HeterCallBackFunc handler);
   virtual void Stop() override;
   virtual void Load(WrapperContext& context) override;
   virtual void Save(WrapperContext& context) override;
   // set client to client communication config
   void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms,
                               int max_retry);
@@ -168,7 +179,8 @@ class FleetWrapper : public PSWrapper {
                                  std::vector<const LoDTensor*>* inputs,
                                  const LoDTensor* shows,
                                  const LoDTensor* clicks,
-                                 std::vector<LoDTensor*>* outputs);
+                                 std::vector<LoDTensor*>* outputs,
+                                 bool use_cvm_op = false);
   // Push sparse variables to server in Async mode
   // Param<In>: scope, table_id, fea_keys, sparse_grad_names
   // Param<Out>: push_values, push_sparse_status
@@ -185,12 +197,7 @@ class FleetWrapper : public PSWrapper {
       const std::vector<framework::ProgramDesc>& server_sub_program = {});
   // init trainer
   void InitWorker(const std::string& dist_desc,
-                  const std::vector<std::string>& host_sign_list, Scope* scope,
-                  const RpcCtxMap& send_ctx,
-                  const std::unordered_map<uint64_t, std::vector<std::string>>&
-                      dense_varnames,
-                  const std::map<std::string, std::string>& envs, int node_num,
-                  int index);
+                  const std::vector<std::string>& host_sign_list, int index);
   // stop server
   void StopServer();
@@ -200,6 +207,8 @@ class FleetWrapper : public PSWrapper {
   uint64_t RunServer(const std::string& ip, uint32_t port);
   // get client info
   std::vector<uint64_t> GetClientsInfo();
+  // set client info
+  int SetClients(std::vector<uint64_t>& host_sign_list);  // NOLINT
   // create client to client connection
   void CreateClient2ClientConnection();
   // flush all push requests
@@ -255,10 +264,15 @@ class FleetWrapper : public PSWrapper {
   // this performs better than rand_r, especially large data
   std::default_random_engine& LocalRandomEngine();
+  // for init worker
+  void InitGFlag(const std::string& gflags);
   static std::shared_ptr<paddle::distributed::PSCore> pserver_ptr_;
+  static std::shared_ptr<paddle::distributed::PSClient> worker_ptr_;
  private:
   static std::shared_ptr<FleetWrapper> s_instance_;
+  paddle::distributed::PaddlePSEnvironment ps_env_;
   size_t GetAbsoluteSum(size_t start, size_t end, size_t level,
                         const framework::LoD& lod);
@@ -74,7 +74,7 @@ TEST(MemorySparseTable, SGD) {
   std::vector<uint32_t> init_fres = {1, 1, 1, 1, 1};
   std::vector<float> init_values;
-  init_values.resize(init_keys.size() * (emb_dim + 1));
+  init_values.resize(init_keys.size() * (emb_dim + 3));
   auto value = PullSparseValue(init_keys, init_fres, emb_dim);
   table->pull_sparse(init_values.data(), value);
@@ -119,11 +119,11 @@ TEST(MemorySparseTable, SGD) {
   }
   std::vector<float> pull_values;
-  pull_values.resize(init_keys.size() * (emb_dim + 1));
+  pull_values.resize(init_keys.size() * (emb_dim + 3));
   table->pull_sparse(pull_values.data(), value);
   for (size_t i = 0; i < init_keys.size(); ++i) {
-    for (size_t j = 0; j < emb_dim + 1; ++j) {
+    for (size_t j = 2; j < emb_dim + 3; ++j) {
       auto update_val = init_values[i * (emb_dim + 1) + j] -
                         0.1 * total_gradients[3 + i * (emb_dim + 4) + j];
       VLOG(3) << total_gradients[i * (emb_dim + 4) + j + 3] << ":"
@@ -24,6 +24,9 @@
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
 namespace egr {
@@ -235,7 +235,7 @@ if(WITH_PYTHON)
   py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
   py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)
   py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto)
-  py_proto_compile(ps_py_proto SRCS ps.proto)
+  py_proto_compile(ps_py_proto SRCS the_one_ps.proto)
   #Generate an empty \
   #__init__.py to make framework_py_proto as a valid python module.
   add_custom_target(fleet_proto_init ALL
@@ -249,7 +249,7 @@ if(WITH_PYTHON)
     COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
     COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
     COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
-    COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+    COMMAND cp the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto
@@ -261,7 +261,7 @@ if(WITH_PYTHON)
   add_custom_command(TARGET framework_py_proto POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
     COMMAND copy /Y *.py ${proto_dstpath}
-    COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath}
+    COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath}
     COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
...@@ -314,7 +314,7 @@ if(WITH_DISTRIBUTE) ...@@ -314,7 +314,7 @@ if(WITH_DISTRIBUTE)
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
heterxpu_trainer.cc heter_pipeline_trainer.cc heterxpu_trainer.cc heter_pipeline_trainer.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
downpour_worker.cc downpour_worker_opt.cc downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
index_sampler index_wrapper sampler index_dataset_proto index_sampler index_wrapper sampler index_dataset_proto
...@@ -329,6 +329,7 @@ if(WITH_DISTRIBUTE) ...@@ -329,6 +329,7 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(downpour_lite_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(heter_section_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(heter_section_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(heter_pipeline_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(heter_pipeline_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else() else()
......
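
The CMake changes above switch the generated parameter-server proto from ps.proto to the_one_ps.proto and copy the resulting the_one_ps_pb2.py into the fleet proto package. A minimal sketch of checking the generated module, assuming a wheel built from this branch and that the copy destination maps to the package path paddle.distributed.fleet.proto (both are assumptions, not confirmed by the diff):

import importlib

# Assumed import path based on the copy destination in the CMake rule above.
mod = importlib.import_module("paddle.distributed.fleet.proto.the_one_ps_pb2")
# List the generated protobuf message classes without assuming their names.
print([name for name in dir(mod) if not name.startswith("_")])
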
...@@ -27,6 +27,10 @@ limitations under the License. */ ...@@ -27,6 +27,10 @@ limitations under the License. */
#include <utility> // NOLINT #include <utility> // NOLINT
#include <vector> #include <vector>
#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif
#include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/heter_util.h"
...@@ -107,7 +111,12 @@ class PullDenseWorker { ...@@ -107,7 +111,12 @@ class PullDenseWorker {
bool CheckUpdateParam(uint64_t table_id); bool CheckUpdateParam(uint64_t table_id);
private: private:
#if defined(PADDLE_WITH_PSCORE)
std::shared_ptr<paddle::distributed::FleetWrapper> fleet_ptr_;
#else
std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_; std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
#endif
PullDenseWorkerParameter param_; PullDenseWorkerParameter param_;
DownpourWorkerParameter dwp_param_; DownpourWorkerParameter dwp_param_;
Scope* root_scope_; Scope* root_scope_;
...@@ -341,6 +350,79 @@ class DownpourWorker : public HogwildWorker { ...@@ -341,6 +350,79 @@ class DownpourWorker : public HogwildWorker {
// std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_; // std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
}; };
// Based on DownpourWorker, with push/pull code moved into operators
#if defined(PADDLE_WITH_PSCORE)
class DownpourLiteWorker : public HogwildWorker {
public:
DownpourLiteWorker() {}
virtual ~DownpourLiteWorker() {}
virtual void Initialize(const TrainerDesc& desc);
virtual void TrainFiles();
virtual void TrainFilesWithProfiler();
protected:
std::shared_ptr<paddle::distributed::FleetWrapper> fleet_ptr_;
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
void PushGradients();
void CopySparseTable();
void CopyDenseTable();
void CopyDenseVars();
DownpourWorkerParameter param_;
// copy table
CopyTableConfig copy_table_config_;
std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
// actually pushed feasign of each table
std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
// feasign
std::map<uint64_t, std::vector<uint64_t>> features_;
// feasign embedding
std::map<uint64_t, std::vector<std::vector<float>>> feature_values_;
std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
// adjust ins weight
AdjustInsWeightConfig adjust_ins_weight_config_;
// check nan and inf during training
std::vector<std::string> check_nan_var_names_;
bool need_to_push_sparse_;
// feasign stats
std::map<uint64_t, std::vector<float>> feature_labels_;
std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
// feasign embedding gradient
std::map<uint64_t, std::vector<std::vector<float>>> feature_grads_;
std::vector<::std::future<int32_t>> push_sparse_status_;
bool dump_slot_;
bool need_to_push_dense_;
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
float scale_datanorm_;
std::vector<::std::future<int32_t>> push_dense_status_;
// skipped ops
std::vector<std::string> skip_ops_;
// just save the value in param_ for easy access
std::map<uint64_t, std::string> label_var_name_;
std::map<uint64_t, std::vector<std::string>> dense_value_names_;
std::map<uint64_t, uint64_t> table_dependency_;
std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
// multitask
std::map<int32_t, uint64_t> cond2table_map_;
std::set<uint64_t> condvalue_set_;
bool flag_partial_push_;
private:
// std::vector<std::string> dump_param_;
// just save the value in param_ for easy access
// std::map<uint64_t, std::string> label_var_name_;
// std::map<uint64_t, std::vector<std::string>> dense_value_names_;
std::shared_ptr<PullDenseWorker> _pull_dense_worker;
std::vector<float> nid_show_;
// std::map<uint64_t, uint64_t> table_dependency_;
// std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
};
#endif
class DownpourWorkerOpt : public DownpourWorker { class DownpourWorkerOpt : public DownpourWorker {
public: public:
DownpourWorkerOpt() {} DownpourWorkerOpt() {}
......
...@@ -67,6 +67,7 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorker); ...@@ -67,6 +67,7 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt); REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
#if defined(PADDLE_WITH_PSCORE) #if defined(PADDLE_WITH_PSCORE)
REGISTER_DEVICE_WORKER_CLASS(DownpourLiteWorker);
REGISTER_DEVICE_WORKER_CLASS(HeterSectionWorker); REGISTER_DEVICE_WORKER_CLASS(HeterSectionWorker);
#endif #endif
......
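
REGISTER_DEVICE_WORKER_CLASS(DownpourLiteWorker) adds the new worker to the device-worker factory so a trainer can instantiate it by the name carried in TrainerDesc. The following is a conceptual Python sketch of that name-to-constructor registry pattern; the registry, decorator, and class bodies are illustrative, not Paddle APIs.

_DEVICE_WORKER_REGISTRY = {}

def register_device_worker(cls):
    """Register a worker class under its class name, mimicking the C++ macro."""
    _DEVICE_WORKER_REGISTRY[cls.__name__] = cls
    return cls

@register_device_worker
class HogwildWorker:
    def train_files(self):
        print("plain hogwild training loop")

@register_device_worker
class DownpourLiteWorker(HogwildWorker):
    # Push/pull of parameters is assumed to happen inside operators,
    # so the worker itself stays a thin training loop.
    def train_files(self):
        print("hogwild loop; sparse/dense push-pull handled by ops")

def create_device_worker(name):
    # TrainerDesc carries the worker name; the factory looks it up here.
    return _DEVICE_WORKER_REGISTRY[name]()

worker = create_device_worker("DownpourLiteWorker")
worker.train_files()
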
...@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/trainer.h"
...@@ -62,7 +66,11 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc, ...@@ -62,7 +66,11 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
} }
void DistMultiTrainer::RegisterHeterCallback() { void DistMultiTrainer::RegisterHeterCallback() {
#ifdef PADDLE_WITH_PSCORE
auto fleet_ptr = paddle::distributed::FleetWrapper::GetInstance();
#else
auto fleet_ptr = FleetWrapper::GetInstance(); auto fleet_ptr = FleetWrapper::GetInstance();
#endif
fleet_ptr->RegisterHeterCallback( fleet_ptr->RegisterHeterCallback(
[this](int worker, int taskid) { workers_[worker]->Schedule(taskid); }); [this](int worker, int taskid) { workers_[worker]->Schedule(taskid); });
} }
...@@ -93,7 +101,7 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program, ...@@ -93,7 +101,7 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
workers_[i]->SetRootScope(root_scope_); workers_[i]->SetRootScope(root_scope_);
workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->CreateDeviceResource(main_program); // Program
workers_[i]->BindingDataFeedMemory(); workers_[i]->BindingDataFeedMemory();
#ifdef PADDLE_WITH_PSLIB #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
workers_[i]->CacheProgram(main_program); workers_[i]->CacheProgram(main_program);
#endif #endif
} }
...@@ -110,7 +118,7 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) { ...@@ -110,7 +118,7 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
} }
pull_dense_worker_->SetRootScope(root_scope_); pull_dense_worker_->SetRootScope(root_scope_);
pull_dense_worker_->Start(); pull_dense_worker_->Start();
#ifdef PADDLE_WITH_PSLIB #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
for (int i = 0; i < thread_num_; ++i) { for (int i = 0; i < thread_num_; ++i) {
workers_[i]->GetXpuOpIndex(); workers_[i]->GetXpuOpIndex();
} }
...@@ -176,8 +184,12 @@ void DistMultiTrainer::Finalize() { ...@@ -176,8 +184,12 @@ void DistMultiTrainer::Finalize() {
pull_dense_worker_->Stop(); pull_dense_worker_->Stop();
root_scope_->DropKids(); root_scope_->DropKids();
// flush local client push queue // flush local client push queue
#ifdef PADDLE_WITH_PSCORE
auto fleet_ptr_ = paddle::distributed::FleetWrapper::GetInstance();
#else
auto fleet_ptr_ = FleetWrapper::GetInstance(); auto fleet_ptr_ = FleetWrapper::GetInstance();
#endif
fleet_ptr_->ClientFlush(); fleet_ptr_->ClientFlush();
} }
......
(This diff has been collapsed.)
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include <numeric> #include <numeric>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#if defined(PADDLE_WITH_PSLIB) #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -38,7 +38,7 @@ limitations under the License. */ ...@@ -38,7 +38,7 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif #endif
#if defined(PADDLE_WITH_PSLIB) #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -61,7 +61,13 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { ...@@ -61,7 +61,13 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
last_versions_[tid] = 0; last_versions_[tid] = 0;
current_version_[tid] = 0; current_version_[tid] = 0;
} }
#if defined(PADDLE_WITH_PSCORE)
fleet_ptr_ = paddle::distributed::FleetWrapper::GetInstance();
#else
fleet_ptr_ = FleetWrapper::GetInstance(); fleet_ptr_ = FleetWrapper::GetInstance();
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
copy_streams_.clear(); copy_streams_.clear();
#endif #endif
...@@ -170,6 +176,9 @@ void PullDenseWorker::PullDense(bool force_update) { ...@@ -170,6 +176,9 @@ void PullDenseWorker::PullDense(bool force_update) {
VLOG(3) << "pull dense " << force_update << " " << tid; VLOG(3) << "pull dense " << force_update << " " << tid;
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, false); &pull_dense_status_, false);
#elif defined(PADDLE_WITH_PSCORE)
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, true);
#else #else
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, true); &pull_dense_status_, true);
......
...@@ -30,6 +30,21 @@ namespace operators { ...@@ -30,6 +30,21 @@ namespace operators {
class AbsOp : public framework::OperatorWithKernel { class AbsOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class AbsOpMaker : public framework::OpProtoAndCheckerMaker { class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -72,8 +87,17 @@ class AbsGradOp : public framework::OperatorWithKernel { ...@@ -72,8 +87,17 @@ class AbsGradOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto input_data_type =
return framework::OpKernelType(dtype, ctx.GetPlace()); framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
} }
}; };
......
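
AbsOp and AbsGradOp now override GetExpectedKernelType: when the oneDNN path can be used for the input dtype, the MKLDNN kernel is selected, otherwise the plain kernel on the current place is returned. A conceptual Python sketch of that fallback selection follows; the predicate and kernel descriptors are illustrative, not Paddle APIs.

def can_mkldnn_be_used(place, dtype, use_mkldnn_attr):
    # Mirrors the spirit of CanMKLDNNBeUsed: CPU place, supported dtype, attr on.
    return use_mkldnn_attr and place == "cpu" and dtype in ("float32", "bfloat16")

def expected_kernel_type(place, dtype, use_mkldnn_attr):
    if can_mkldnn_be_used(place, dtype, use_mkldnn_attr):
        # Layout/library tags stand in for kMKLDNN / LibraryType::kMKLDNN.
        return {"dtype": dtype, "place": place, "layout": "MKLDNN", "library": "MKLDNN"}
    return {"dtype": dtype, "place": place, "layout": "default", "library": "plain"}

print(expected_kernel_type("cpu", "float32", use_mkldnn_attr=True))  # oneDNN kernel
print(expected_kernel_type("gpu", "float32", use_mkldnn_attr=True))  # falls back to plain
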
...@@ -390,6 +390,204 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> { ...@@ -390,6 +390,204 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
} }
} }
}; };
template <typename T>
class NPUConv3dKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
Tensor* output = ctx.Output<Tensor>("Output");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
PADDLE_ENFORCE_EQ(data_format, "NCDHW",
platform::errors::Unimplemented(
"the data_format must be NCDHW in "
"the npu kernel of conv3d, but got data_format "
"= [%s]",
data_format));
PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented(
"the groups must be 1 in "
"the npu kernel of conv3d, but got groups "
"= [%d]",
groups));
output->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto input_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(input->dims(), dev_ctx);
auto filter_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(filter->dims(), dev_ctx);
auto output_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(output->dims(), dev_ctx);
input_tensor.ShareDataWith(*input);
filter_tensor.ShareDataWith(*filter);
output_tensor.ShareDataWith(*output);
input_tensor.set_layout(DataLayout::kNCDHW);
filter_tensor.set_layout(DataLayout::kNCDHW);
output_tensor.set_layout(DataLayout::kNCDHW);
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
std::vector<int> strides_vec(5, 1);
std::vector<int> dilations_vec(5, 1);
strides_vec[2] = strides[0];
strides_vec[3] = strides[1];
strides_vec[4] = strides[2];
dilations_vec[2] = dilations[0];
dilations_vec[3] = dilations[1];
dilations_vec[4] = dilations[2];
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
const auto& runner =
NpuOpRunner("Conv3D", {input_tensor, filter_tensor}, {output_tensor},
{{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
};
template <typename T>
class NPUConv3dGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
PADDLE_ENFORCE_EQ(data_format, "NCDHW",
platform::errors::Unimplemented(
"the data_format must be NCDHW in "
"the npu kernel of conv3d, but got data_format "
"= [%s]",
data_format));
PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented(
"the groups must be 1 in "
"the npu kernel of conv3d, but got groups "
"= [%d]",
groups));
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto input_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(input->dims(), dev_ctx);
auto filter_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(filter->dims(), dev_ctx);
auto output_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
output_grad->dims(), dev_ctx);
input_tensor.ShareDataWith(*input);
filter_tensor.ShareDataWith(*filter);
output_grad_tensor.ShareDataWith(*output_grad);
input_tensor.set_layout(DataLayout::kNCDHW);
filter_tensor.set_layout(DataLayout::kNCDHW);
output_grad_tensor.set_layout(DataLayout::kNCDHW);
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
std::vector<int> strides_vec(5, 1);
std::vector<int> dilations_vec(5, 1);
strides_vec[2] = strides[0];
strides_vec[3] = strides[1];
strides_vec[4] = strides[2];
dilations_vec[2] = dilations[0];
dilations_vec[3] = dilations[1];
dilations_vec[4] = dilations[2];
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> filter_shape_vec = phi::vectorize<int>(filter->dims());
Tensor filter_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
filter_grad->dims(), dev_ctx);
filter_grad_tensor.ShareDataWith(*filter_grad);
filter_grad_tensor.set_layout(DataLayout::kNCDHW);
const auto& runner = NpuOpRunner(
"Conv3DBackpropFilterD", {input_tensor, output_grad_tensor},
{filter_grad_tensor}, {{"filter_size", filter_shape_vec},
{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> input_shape_vec = phi::vectorize<int>(input->dims());
Tensor input_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
input_grad->dims(), dev_ctx);
input_grad_tensor.ShareDataWith(*input_grad);
input_grad_tensor.set_layout(DataLayout::kNCDHW);
const auto& runner = NpuOpRunner(
"Conv3DBackpropInputD", {filter_tensor, output_grad_tensor},
{input_grad_tensor}, {{"input_size", input_shape_vec},
{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -408,3 +606,9 @@ REGISTER_OP_NPU_KERNEL(conv2d, ops::NPUConvOpKernel<float>, ...@@ -408,3 +606,9 @@ REGISTER_OP_NPU_KERNEL(conv2d, ops::NPUConvOpKernel<float>,
REGISTER_OP_NPU_KERNEL(conv2d_grad, ops::NPUConvGradOpKernel<float>, REGISTER_OP_NPU_KERNEL(conv2d_grad, ops::NPUConvGradOpKernel<float>,
ops::NPUConvGradOpKernel<plat::float16>); ops::NPUConvGradOpKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(conv3d, ops::NPUConv3dKernel<float>,
ops::NPUConv3dKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(conv3d_grad, ops::NPUConv3dGradKernel<float>,
ops::NPUConv3dGradKernel<plat::float16>);
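
The new NPUConv3dKernel and NPUConv3dGradKernel only accept data_format "NCDHW" and groups == 1, as enforced by the PADDLE_ENFORCE_EQ checks above. A hedged usage sketch through the public conv3d API; running it on an Ascend device assumes a WITH_ASCEND_CL build where "npu" is a valid device string, otherwise the call dispatches to the usual CPU/GPU kernels.

import paddle
import paddle.nn.functional as F

# paddle.set_device("npu")  # uncomment on an Ascend (WITH_ASCEND_CL) build

x = paddle.randn([2, 3, 8, 32, 32], dtype="float32")   # N, C, D, H, W
w = paddle.randn([16, 3, 3, 3, 3], dtype="float32")    # out_c, in_c/groups, kD, kH, kW
# NCDHW layout and groups=1 match the constraints of the NPU kernel above.
y = F.conv3d(x, w, stride=1, padding=1, dilation=1, groups=1, data_format="NCDHW")
print(y.shape)  # [2, 16, 8, 32, 32]
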
...@@ -315,15 +315,7 @@ using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< ...@@ -315,15 +315,7 @@ using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc<
namespace ops = paddle::operators; namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \ #define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>); \
REGISTER_OP_KERNEL( \
act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(act_type, functor, \
grad_functor) \
REGISTER_OP_KERNEL( \ REGISTER_OP_KERNEL( \
act_type, MKLDNN, ::paddle::platform::CPUPlace, \ act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>, \ ops::MKLDNNActivationKernel<ops::functor<float>>, \
...@@ -339,30 +331,27 @@ namespace ops = paddle::operators; ...@@ -339,30 +331,27 @@ namespace ops = paddle::operators;
ops::MKLDNNActivationKernel<ops::functor<float>>); ops::MKLDNNActivationKernel<ops::functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
__macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \
__macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
__macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \
__macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \
__macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor); \
__macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \ __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \
__macro(elu, EluMKLDNNFunctor, EluMKLDNNGradUseOutFunctor); \ __macro(elu, EluMKLDNNFunctor, EluMKLDNNGradUseOutFunctor); \
__macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); \
__macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); \
__macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \
__macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
__macro(mish, MishMKLDNNFunctor, MishMKLDNNGradFunctor); \
__macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
__macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \
__macro(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradUseOutFunctor); \
__macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradUseOutFunctor); \
__macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \
__macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor);
ReluMKLDNNGradFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor,
GeluMKLDNNGradFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor,
SigmoidMKLDNNGradUseOutFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sqrt, SqrtMKLDNNFunctor,
SqrtMKLDNNGradUseOutFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(mish, MishMKLDNNFunctor,
MishMKLDNNGradFunctor);
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL( REGISTER_OP_KERNEL(
softplus, MKLDNN, paddle::platform::CPUPlace, softplus, MKLDNN, paddle::platform::CPUPlace,
ops::MKLDNNActivationKernel<ops::SoftplusMKLDNNFunctor<float>>); ops::MKLDNNActivationKernel<ops::SoftplusMKLDNNFunctor<float>>,
ops::MKLDNNActivationKernel<
ops::SoftplusMKLDNNFunctor<paddle::platform::bfloat16>>);
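
The separate BF16 macro is folded into REGISTER_ACTIVATION_MKLDNN_KERNEL, so every activation listed in FOR_EACH_MKLDNN_KERNEL_FUNCTOR goes through one registration path. A conceptual Python sketch of what the macro expansion amounts to, assuming the consolidated macro registers both float32 and bfloat16 forward and gradient kernels for each listed activation (the register() helper and the table are illustrative, not Paddle APIs):

ACTIVATIONS = [
    "abs", "elu", "exp", "gelu", "hard_swish", "leaky_relu",
    "mish", "relu", "relu6", "sigmoid", "sqrt", "swish", "tanh",
]

def register(op_type, dtype, grad=False):
    suffix = "_grad" if grad else ""
    print(f"register {op_type}{suffix} MKLDNN kernel for {dtype}")

for act in ACTIVATIONS:
    for dtype in ("float32", "bfloat16"):  # fp32 + bf16 handled by one macro now
        register(act, dtype)
        register(act, dtype, grad=True)

# round keeps a forward-only registration (REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY).
register("round", "float32")
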
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -52,15 +51,13 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> { ...@@ -52,15 +51,13 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
auto inputs = context.MultiInput<framework::LoDTensor>("Ids"); auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs"); auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");
// auto fleet = distributed::FleetWrapper::GetInstance(); auto fleet = distributed::FleetWrapper::GetInstance();
auto *communicator = (distributed::AsyncCommunicator *)
distributed::Communicator::GetInstance();
if (platform::is_cpu_place(context.GetPlace())) { if (platform::is_cpu_place(context.GetPlace())) {
communicator->PullSparseToTensorSync( fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(table_id), emb_dim, static_cast<uint64_t>(padding_idx),
static_cast<uint64_t>(padding_idx), context.GetPlace(), !is_test, context.GetPlace(), !is_test, &inputs,
&inputs, &outputs); &outputs);
} else { } else {
auto inputs_variable = context.MultiInputVar("Ids"); auto inputs_variable = context.MultiInputVar("Ids");
auto outputs_variable = context.MultiOutputVar("Outputs"); auto outputs_variable = context.MultiOutputVar("Outputs");
...@@ -96,10 +93,10 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> { ...@@ -96,10 +93,10 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
} }
// use fleet->PullSparse // use fleet->PullSparse
communicator->PullSparseToTensorSync( fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(table_id), emb_dim, static_cast<uint64_t>(padding_idx),
static_cast<uint64_t>(padding_idx), cpu_place, !is_test, cpu_place, !is_test, &tmp_input_vec,
&tmp_input_vec, &tmp_output_vec); &tmp_output_vec);
// cp temp to origin // cp temp to origin
for (size_t idx = 0; idx < output_var_size; ++idx) { for (size_t idx = 0; idx < output_var_size; ++idx) {
......
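
The CPU branch of DistributedLookupTableKernel now calls FleetWrapper::PullSparseToTensorSync directly instead of going through the AsyncCommunicator. From Python, this op is typically emitted by sparse embeddings in parameter-server training. A hedged sketch of that path, assuming static graph mode and a PS job launched with fleetrun (the role environment variables it sets are not shown here):

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=False)

ids = paddle.static.data(name="ids", shape=[-1, 1], dtype="int64")
# sparse_embedding lowers to distributed_lookup_table against the parameter servers.
emb = paddle.static.nn.sparse_embedding(input=ids, size=[100000, 16])
loss = paddle.mean(emb)

strategy = fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(paddle.optimizer.SGD(0.01), strategy)
optimizer.minimize(loss)
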
...@@ -106,6 +106,9 @@ class DistributedPushSparseOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -106,6 +106,9 @@ class DistributedPushSparseOpMaker : public framework::OpProtoAndCheckerMaker {
"for training.") "for training.")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_cvm_op", "(boolean, default false) Use cvm op or not.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Lookup Tablel Prefetch Operator. Lookup Tablel Prefetch Operator.
This operator is used to perform lookup on parameter W, This operator is used to perform lookup on parameter W,
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -32,22 +31,20 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> { ...@@ -32,22 +31,20 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
auto padding_idx = context.Attr<int64_t>("padding_idx"); auto padding_idx = context.Attr<int64_t>("padding_idx");
auto table_id = context.Attr<int>("table_id"); auto table_id = context.Attr<int>("table_id");
auto emb_dim = context.Attr<int>("size"); auto emb_dim = context.Attr<int>("size");
VLOG(1) << "push_sparse.h::emb_dim: " << emb_dim; auto use_cvm_op = context.Attr<bool>("use_cvm_op");
auto inputs = context.MultiInput<framework::LoDTensor>("Ids"); auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
auto shows = context.Input<framework::LoDTensor>("Shows"); auto shows = context.Input<framework::LoDTensor>("Shows");
auto clks = context.Input<framework::LoDTensor>("Clicks"); auto clks = context.Input<framework::LoDTensor>("Clicks");
auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs"); auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");
// auto fleet = distributed::FleetWrapper::GetInstance(); auto fleet = distributed::FleetWrapper::GetInstance();
auto *communicator = (distributed::AsyncCommunicator *)
distributed::Communicator::GetInstance();
if (platform::is_cpu_place(context.GetPlace())) { if (platform::is_cpu_place(context.GetPlace())) {
communicator->PushSparseFromTensorAsync( fleet->PushSparseFromTensorAsync(static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(table_id), emb_dim, static_cast<uint64_t>(padding_idx),
static_cast<uint64_t>(padding_idx), context.GetPlace(), &inputs, context.GetPlace(), &inputs, shows, clks,
shows, clks, &outputs); &outputs, use_cvm_op);
} else { } else {
auto inputs_variable = context.MultiInputVar("Ids"); auto inputs_variable = context.MultiInputVar("Ids");
auto outputs_variable = context.MultiOutputVar("Outputs"); auto outputs_variable = context.MultiOutputVar("Outputs");
...@@ -94,7 +91,7 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> { ...@@ -94,7 +91,7 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
} }
// use fleet->PullSparse // use fleet->PullSparse
communicator->PushSparseFromTensorAsync( fleet->PushSparseFromTensorAsync(
static_cast<uint64_t>(table_id), emb_dim, static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx), context.GetPlace(), static_cast<uint64_t>(padding_idx), context.GetPlace(),
&tmp_input_vec, tmp_shows_tensor, tmp_clicks_tensor, &tmp_output_vec); &tmp_input_vec, tmp_shows_tensor, tmp_clicks_tensor, &tmp_output_vec);
......
...@@ -53,7 +53,7 @@ class SendOp : public framework::OperatorBase { ...@@ -53,7 +53,7 @@ class SendOp : public framework::OperatorBase {
send_varnames[0] != "@PS_STEP_COUNTER@") { send_varnames[0] != "@PS_STEP_COUNTER@") {
auto fleet = paddle::distributed::FleetWrapper::GetInstance(); auto fleet = paddle::distributed::FleetWrapper::GetInstance();
std::vector<::std::future<int32_t>> status; std::vector<::std::future<int32_t>> status;
fleet->PushDenseVarsAsync(scope, table_id, ins, &status, 0, -1); fleet->PushDenseVarsAsync(scope, table_id, ins, &status, -1, -1);
} else { } else {
auto* communicator = paddle::distributed::Communicator::GetInstance(); auto* communicator = paddle::distributed::Communicator::GetInstance();
if (communicator->Check(send_varnames)) { if (communicator->Check(send_varnames)) {
......
...@@ -47,6 +47,8 @@ static std::map<framework::proto::VarType::Type, aclDataType> ...@@ -47,6 +47,8 @@ static std::map<framework::proto::VarType::Type, aclDataType>
static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = { static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
{DataLayout::kNCHW, ACL_FORMAT_NCHW}, {DataLayout::kNCHW, ACL_FORMAT_NCHW},
{DataLayout::kNHWC, ACL_FORMAT_NHWC}, {DataLayout::kNHWC, ACL_FORMAT_NHWC},
{DataLayout::kNCDHW, ACL_FORMAT_NCDHW},
{DataLayout::kNDHWC, ACL_FORMAT_NDHWC},
{DataLayout::kAnyLayout, ACL_FORMAT_ND}, {DataLayout::kAnyLayout, ACL_FORMAT_ND},
}; };
......
...@@ -77,6 +77,8 @@ void BindDistFleetWrapper(py::module* m) { ...@@ -77,6 +77,8 @@ void BindDistFleetWrapper(py::module* m) {
.def("stop_worker", &FleetWrapper::FinalizeWorker) .def("stop_worker", &FleetWrapper::FinalizeWorker)
.def("barrier", &FleetWrapper::BarrierWithTable) .def("barrier", &FleetWrapper::BarrierWithTable)
.def("shrink_sparse_table", &FleetWrapper::ShrinkSparseTable) .def("shrink_sparse_table", &FleetWrapper::ShrinkSparseTable)
.def("set_clients", &FleetWrapper::SetClients)
.def("get_client_info", &FleetWrapper::GetClientsInfo)
.def("create_client2client_connection", .def("create_client2client_connection",
&FleetWrapper::CreateClient2ClientConnection); &FleetWrapper::CreateClient2ClientConnection);
} }
......
...@@ -30,6 +30,8 @@ enum class DataLayout { ...@@ -30,6 +30,8 @@ enum class DataLayout {
SPARSE_COO, SPARSE_COO,
SPARSE_CSR, SPARSE_CSR,
NUM_DATA_LAYOUTS, NUM_DATA_LAYOUTS,
NDHWC,
NCDHW,
// See Note [ Why we need ALL in basic kernel key member? ] // See Note [ Why we need ALL in basic kernel key member? ]
ALL_LAYOUT = UNDEFINED, ALL_LAYOUT = UNDEFINED,
// Note: Unify phi DataLayout and fluid::framework::DataLayout, // Note: Unify phi DataLayout and fluid::framework::DataLayout,
...@@ -43,6 +45,8 @@ enum class DataLayout { ...@@ -43,6 +45,8 @@ enum class DataLayout {
kNHWC = NHWC, kNHWC = NHWC,
kNCHW = NCHW, kNCHW = NCHW,
kMKLDNN = MKLDNN, // all layouts supported by MKLDNN internally kMKLDNN = MKLDNN, // all layouts supported by MKLDNN internally
kNDHWC = NDHWC,
kNCDHW = NCDHW,
}; };
} // namespace experimental } // namespace experimental
...@@ -70,6 +74,10 @@ inline DataLayout StringToDataLayout(const std::string& str) { ...@@ -70,6 +74,10 @@ inline DataLayout StringToDataLayout(const std::string& str) {
return DataLayout::SPARSE_COO; return DataLayout::SPARSE_COO;
} else if (s == "SPARSE_CSR") { } else if (s == "SPARSE_CSR") {
return DataLayout::SPARSE_CSR; return DataLayout::SPARSE_CSR;
} else if (s == "NDHWC") {
return DataLayout::kNDHWC;
} else if (s == "NCDHW") {
return DataLayout::kNCDHW;
} else { } else {
PD_THROW("Unknown data layout type string: ", s, "."); PD_THROW("Unknown data layout type string: ", s, ".");
} }
...@@ -89,6 +97,10 @@ inline std::string DataLayoutToString(const DataLayout& layout) { ...@@ -89,6 +97,10 @@ inline std::string DataLayoutToString(const DataLayout& layout) {
return "SPARSE_COO"; return "SPARSE_COO";
case DataLayout::SPARSE_CSR: case DataLayout::SPARSE_CSR:
return "SPARSE_CSR"; return "SPARSE_CSR";
case DataLayout::kNDHWC:
return "NDHWC";
case DataLayout::kNCDHW:
return "NCDHW";
default: default:
PD_THROW("Unknown Data Layout type ", static_cast<int>(layout), "."); PD_THROW("Unknown Data Layout type ", static_cast<int>(layout), ".");
} }
......
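
phi::DataLayout gains NDHWC and NCDHW members together with the string conversions above, so 5-D layouts round-trip through StringToDataLayout and DataLayoutToString. A conceptual Python mirror of that mapping; the enum values and helper names are illustrative, not the phi API.

import enum

class DataLayout(enum.Enum):
    UNDEFINED = 0
    NHWC = 1
    NCHW = 2
    NDHWC = 3
    NCDHW = 4

def string_to_data_layout(s):
    try:
        return DataLayout[s.upper()]
    except KeyError:
        raise ValueError(f"Unknown data layout type string: {s}.")

def data_layout_to_string(layout):
    return layout.name

assert string_to_data_layout("NCDHW") is DataLayout.NCDHW
assert data_layout_to_string(DataLayout.NDHWC) == "NDHWC"
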
...@@ -259,7 +259,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad, ...@@ -259,7 +259,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad,
phi::dtype::bfloat16, phi::dtype::bfloat16,
phi::dtype::complex<float>, phi::dtype::complex<float>,
phi::dtype::complex<double>) {} phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(elementwise_fmax_grad, PD_REGISTER_KERNEL(fmax_grad,
CPU, CPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::ElementwiseFMaxGradKernel, phi::ElementwiseFMaxGradKernel,
...@@ -268,7 +268,7 @@ PD_REGISTER_KERNEL(elementwise_fmax_grad, ...@@ -268,7 +268,7 @@ PD_REGISTER_KERNEL(elementwise_fmax_grad,
int, int,
int64_t) {} int64_t) {}
PD_REGISTER_KERNEL(elementwise_fmin_grad, PD_REGISTER_KERNEL(fmin_grad,
CPU, CPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::ElementwiseFMinGradKernel, phi::ElementwiseFMinGradKernel,
......
...@@ -87,23 +87,11 @@ using complex128 = ::phi::dtype::complex<double>; ...@@ -87,23 +87,11 @@ using complex128 = ::phi::dtype::complex<double>;
// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
// using bfloat16 = ::phi::dtype::bfloat16; // using bfloat16 = ::phi::dtype::bfloat16;
PD_REGISTER_KERNEL(elementwise_fmax, PD_REGISTER_KERNEL(
CPU, fmax, CPU, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {}
ALL_LAYOUT,
phi::ElementwiseFMaxKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_fmin, PD_REGISTER_KERNEL(
CPU, fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {}
ALL_LAYOUT,
phi::ElementwiseFMinKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(add_raw, PD_REGISTER_KERNEL(add_raw,
CPU, CPU,
......
...@@ -20,18 +20,18 @@ ...@@ -20,18 +20,18 @@
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
void ElementwiseFMaxKernel(const Context& dev_ctx, void FMaxKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& y, const DenseTensor& y,
int axis, int axis,
DenseTensor* out); DenseTensor* out);
template <typename T, typename Context> template <typename T, typename Context>
void ElementwiseFMinKernel(const Context& dev_ctx, void FMinKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& y, const DenseTensor& y,
int axis, int axis,
DenseTensor* out); DenseTensor* out);
template <typename T, typename Context> template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx, void AddRawKernel(const Context& dev_ctx,
......
...@@ -282,7 +282,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad, ...@@ -282,7 +282,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad,
phi::dtype::bfloat16, phi::dtype::bfloat16,
phi::dtype::complex<float>, phi::dtype::complex<float>,
phi::dtype::complex<double>) {} phi::dtype::complex<double>) {}
PD_REGISTER_KERNEL(elementwise_fmax_grad, PD_REGISTER_KERNEL(fmax_grad,
GPU, GPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::ElementwiseFMaxGradKernel, phi::ElementwiseFMaxGradKernel,
...@@ -291,7 +291,7 @@ PD_REGISTER_KERNEL(elementwise_fmax_grad, ...@@ -291,7 +291,7 @@ PD_REGISTER_KERNEL(elementwise_fmax_grad,
int, int,
int64_t) {} int64_t) {}
PD_REGISTER_KERNEL(elementwise_fmin_grad, PD_REGISTER_KERNEL(fmin_grad,
GPU, GPU,
ALL_LAYOUT, ALL_LAYOUT,
phi::ElementwiseFMinGradKernel, phi::ElementwiseFMinGradKernel,
......
...@@ -57,23 +57,11 @@ using bfloat16 = phi::dtype::bfloat16; ...@@ -57,23 +57,11 @@ using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>; using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>; using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(elementwise_fmax, PD_REGISTER_KERNEL(
GPU, fmax, GPU, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {}
ALL_LAYOUT,
phi::ElementwiseFMaxKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_fmin, PD_REGISTER_KERNEL(
GPU, fmin, GPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {}
ALL_LAYOUT,
phi::ElementwiseFMinKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(add_raw, PD_REGISTER_KERNEL(add_raw,
GPU, GPU,
......
...@@ -23,22 +23,22 @@ ...@@ -23,22 +23,22 @@
namespace phi { namespace phi {
template <typename T, typename Context> template <typename T, typename Context>
void ElementwiseFMaxKernel(const Context& dev_ctx, void FMaxKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& y, const DenseTensor& y,
int axis, int axis,
DenseTensor* out) { DenseTensor* out) {
dev_ctx.template Alloc<T>(out); dev_ctx.template Alloc<T>(out);
funcs::ElementwiseCompute<funcs::FMaxFunctor<T>, T, T>( funcs::ElementwiseCompute<funcs::FMaxFunctor<T>, T, T>(
dev_ctx, x, y, axis, funcs::FMaxFunctor<T>(), out); dev_ctx, x, y, axis, funcs::FMaxFunctor<T>(), out);
} }
template <typename T, typename Context> template <typename T, typename Context>
void ElementwiseFMinKernel(const Context& dev_ctx, void FMinKernel(const Context& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
const DenseTensor& y, const DenseTensor& y,
int axis, int axis,
DenseTensor* out) { DenseTensor* out) {
dev_ctx.template Alloc<T>(out); dev_ctx.template Alloc<T>(out);
funcs::ElementwiseCompute<funcs::FMinFunctor<T>, T, T>( funcs::ElementwiseCompute<funcs::FMinFunctor<T>, T, T>(
dev_ctx, x, y, axis, funcs::FMinFunctor<T>(), out); dev_ctx, x, y, axis, funcs::FMinFunctor<T>(), out);
......
...@@ -19,25 +19,19 @@ namespace phi { ...@@ -19,25 +19,19 @@ namespace phi {
KernelSignature ElementwiseAddOpArgumentMapping( KernelSignature ElementwiseAddOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
int axis = paddle::any_cast<int>(ctx.Attr("axis")); int axis = paddle::any_cast<int>(ctx.Attr("axis"));
if (ctx.IsDenseTensorInput("X")) { if (axis == -1) {
if (axis == -1) { return KernelSignature("add", {"X", "Y"}, {}, {"Out"});
return KernelSignature("add", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"});
} }
return KernelSignature("unregistered", {}, {}, {}); return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"});
} }
KernelSignature ElementwiseSubOpArgumentMapping( KernelSignature ElementwiseSubOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
int axis = paddle::any_cast<int>(ctx.Attr("axis")); int axis = paddle::any_cast<int>(ctx.Attr("axis"));
if (ctx.IsDenseTensorInput("X")) { if (axis == -1) {
if (axis == -1) { return KernelSignature("subtract", {"X", "Y"}, {}, {"Out"});
return KernelSignature("subtract", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("subtract_raw", {"X", "Y"}, {"axis"}, {"Out"});
} }
return KernelSignature("unregistered", {}, {}, {}); return KernelSignature("subtract_raw", {"X", "Y"}, {"axis"}, {"Out"});
} }
KernelSignature ElementwiseMulOpArgumentMapping( KernelSignature ElementwiseMulOpArgumentMapping(
...@@ -55,24 +49,18 @@ KernelSignature ElementwiseMulOpArgumentMapping( ...@@ -55,24 +49,18 @@ KernelSignature ElementwiseMulOpArgumentMapping(
KernelSignature ElementwiseDivOpArgumentMapping( KernelSignature ElementwiseDivOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
int axis = paddle::any_cast<int>(ctx.Attr("axis")); int axis = paddle::any_cast<int>(ctx.Attr("axis"));
if (ctx.IsDenseTensorInput("X")) { if (axis == -1) {
if (axis == -1) { return KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
return KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, {"Out"});
} }
return KernelSignature("unregistered", {}, {}, {}); return KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, {"Out"});
} }
KernelSignature ElementwiseAddGradOpArgumentMapping( KernelSignature ElementwiseAddGradOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) { return KernelSignature("add_grad",
return KernelSignature("add_grad", {"X", "Y", GradVarName("Out")},
{"X", "Y", GradVarName("Out")}, {"axis"},
{"axis"}, {GradVarName("X"), GradVarName("Y")});
{GradVarName("X"), GradVarName("Y")});
}
return KernelSignature("unregistered", {}, {}, {});
} }
KernelSignature ElementwiseAddDoubleGradOpArgumentMapping( KernelSignature ElementwiseAddDoubleGradOpArgumentMapping(
...@@ -91,13 +79,10 @@ KernelSignature ElementwiseAddTripleGradOpArgumentMapping( ...@@ -91,13 +79,10 @@ KernelSignature ElementwiseAddTripleGradOpArgumentMapping(
KernelSignature ElementwiseSubGradOpArgumentMapping( KernelSignature ElementwiseSubGradOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) { return KernelSignature("subtract_grad",
return KernelSignature("subtract_grad", {"X", "Y", GradVarName("Out")},
{"X", "Y", GradVarName("Out")}, {"axis"},
{"axis"}, {GradVarName("X"), GradVarName("Y")});
{GradVarName("X"), GradVarName("Y")});
}
return KernelSignature("unregistered", {}, {}, {});
} }
KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( KernelSignature ElementwiseSubDoubleGradOpArgumentMapping(
...@@ -116,7 +101,7 @@ KernelSignature ElementwiseDivGradOpArgumentMapping( ...@@ -116,7 +101,7 @@ KernelSignature ElementwiseDivGradOpArgumentMapping(
KernelSignature ElementwiseFMinGradOpArgumentMapping( KernelSignature ElementwiseFMinGradOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
return KernelSignature("elementwise_fmin_grad", return KernelSignature("fmin_grad",
{"X", "Y", GradVarName("Out")}, {"X", "Y", GradVarName("Out")},
{"axis"}, {"axis"},
{GradVarName("X"), GradVarName("Y")}); {GradVarName("X"), GradVarName("Y")});
...@@ -138,9 +123,19 @@ KernelSignature ElementwiseMulGradOpArgumentMapping( ...@@ -138,9 +123,19 @@ KernelSignature ElementwiseMulGradOpArgumentMapping(
{GradVarName("X"), GradVarName("Y")}); {GradVarName("X"), GradVarName("Y")});
} }
KernelSignature ElementwiseFMaxOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fmax", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwiseFMinOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fmin", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwiseFMaxGradOpArgumentMapping( KernelSignature ElementwiseFMaxGradOpArgumentMapping(
const ArgumentMappingContext& ctx) { const ArgumentMappingContext& ctx) {
return KernelSignature("elementwise_fmax_grad", return KernelSignature("fmax_grad",
{"X", "Y", GradVarName("Out")}, {"X", "Y", GradVarName("Out")},
{"axis"}, {"axis"},
{GradVarName("X"), GradVarName("Y")}); {GradVarName("X"), GradVarName("Y")});
...@@ -179,6 +174,10 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad); ...@@ -179,6 +174,10 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad, multiply_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad, multiply_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad_grad, multiply_double_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad_grad, multiply_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_triple_grad, multiply_triple_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_triple_grad, multiply_triple_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax, fmax);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin, fmin);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax_grad, fmax_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin_grad, fmin_grad);
PD_REGISTER_ARG_MAPPING_FN(elementwise_add, PD_REGISTER_ARG_MAPPING_FN(elementwise_add,
phi::ElementwiseAddOpArgumentMapping); phi::ElementwiseAddOpArgumentMapping);
...@@ -208,9 +207,12 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad, ...@@ -208,9 +207,12 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad,
phi::ElementwiseMulDoubleGradOpArgumentMapping); phi::ElementwiseMulDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad, PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad,
phi::ElementwiseMulTripleGradOpArgumentMapping); phi::ElementwiseMulTripleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax,
phi::ElementwiseFMaxOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin,
phi::ElementwiseFMinOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad, PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad,
phi::ElementwiseFMaxGradOpArgumentMapping); phi::ElementwiseFMaxGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad, PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad,
phi::ElementwiseFMinGradOpArgumentMapping); phi::ElementwiseFMinGradOpArgumentMapping);
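
The phi kernels elementwise_fmax/elementwise_fmin are renamed to fmax/fmin, and PD_REGISTER_BASE_KERNEL_NAME plus the new argument mappings keep the old fluid op names pointing at them, so behavior seen from Python is unchanged. A hedged usage sketch, assuming a Paddle build that exposes paddle.fmax and paddle.fmin:

import paddle

# Like C's fmax/fmin, these prefer the non-NaN operand when one side is NaN.
x = paddle.to_tensor([1.0, float("nan"), 3.0])
y = paddle.to_tensor([2.0, 2.0, float("nan")])

print(paddle.fmax(x, y).numpy())  # [2., 2., 3.]
print(paddle.fmin(x, y).numpy())  # [1., 2., 3.]
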
...@@ -578,7 +578,7 @@ class Fleet(object): ...@@ -578,7 +578,7 @@ class Fleet(object):
@is_non_distributed_check @is_non_distributed_check
@inited_runtime_handler @inited_runtime_handler
def init_worker(self): def init_worker(self, scopes=None):
""" """
initialize `Communicator` for parameter server training. initialize `Communicator` for parameter server training.
...@@ -599,7 +599,7 @@ class Fleet(object): ...@@ -599,7 +599,7 @@ class Fleet(object):
fleet.init_worker() fleet.init_worker()
""" """
self._runtime_handle._init_worker() self._runtime_handle._init_worker(scopes)
@is_non_distributed_check @is_non_distributed_check
@inited_runtime_handler @inited_runtime_handler
...@@ -1419,6 +1419,21 @@ class Fleet(object): ...@@ -1419,6 +1419,21 @@ class Fleet(object):
# for more examples, please reference https://github.com/PaddlePaddle/FleetX # for more examples, please reference https://github.com/PaddlePaddle/FleetX
""" """
if not isinstance(loss, list):
return self._minimize_impl(loss, startup_program, parameter_list,
no_grad_set)
else:
if paddle.fluid.framework.in_dygraph_mode(
) or self._role_maker._is_non_distributed() or self._is_collective:
raise ValueError("loss can be list only in PS mode")
return self._minimize_losses_impl(loss, startup_program,
parameter_list, no_grad_set)
def _minimize_impl(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None):
context = {} context = {}
context["user_defined_strategy"] = copy.deepcopy( context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy) self._user_defined_strategy)
...@@ -1447,6 +1462,7 @@ class Fleet(object): ...@@ -1447,6 +1462,7 @@ class Fleet(object):
"sharding_degree"] "sharding_degree"]
context["origin_main_program"] = self.origin_main_program context["origin_main_program"] = self.origin_main_program
context["origin_main_programs"] = [self.origin_main_program]
context["loss"] = loss context["loss"] = loss
if startup_program == None: if startup_program == None:
self.origin_startup_program = \ self.origin_startup_program = \
...@@ -1457,6 +1473,7 @@ class Fleet(object): ...@@ -1457,6 +1473,7 @@ class Fleet(object):
startup_program.clone(for_test=False) startup_program.clone(for_test=False)
context["origin_startup_program"] = startup_program context["origin_startup_program"] = startup_program
context["origin_startup_programs"] = [startup_program]
context["role_maker"] = self._role_maker context["role_maker"] = self._role_maker
# Use the auto-parallel's routines instead # Use the auto-parallel's routines instead
...@@ -1512,6 +1529,8 @@ class Fleet(object): ...@@ -1512,6 +1529,8 @@ class Fleet(object):
copy_user_defined_strategy, can_not_apply_optimizer_list) copy_user_defined_strategy, can_not_apply_optimizer_list)
context["valid_strategy"] = copy.deepcopy(valid_strategy) context["valid_strategy"] = copy.deepcopy(valid_strategy)
# print("valid_strategy:", context["valid_strategy"])
# print("user_defined_strategy:", context["user_defined_strategy"])
applied_meta_list = self.strategy_compiler._get_applied_meta_list() applied_meta_list = self.strategy_compiler._get_applied_meta_list()
applied_graph_list = self.strategy_compiler._get_applied_graph_list() applied_graph_list = self.strategy_compiler._get_applied_graph_list()
...@@ -1539,13 +1558,17 @@ class Fleet(object): ...@@ -1539,13 +1558,17 @@ class Fleet(object):
loss, startup_program, parameter_list, no_grad_set=no_grad_set) loss, startup_program, parameter_list, no_grad_set=no_grad_set)
if meta_optimizer: if meta_optimizer:
# print("before minimize program id:", id(loss.block.program))
optimize_ops, params_grads = meta_optimizer.minimize( optimize_ops, params_grads = meta_optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set) loss, startup_program, parameter_list, no_grad_set=no_grad_set)
# print("after minimize program id:", id(loss.block.program))
default_program = paddle.static.default_main_program() default_program = paddle.static.default_main_program()
# print("default program id:", id(default_program))
if id(default_program) != id(loss.block.program): if id(default_program) != id(loss.block.program):
paddle.fluid.framework.switch_main_program(loss.block.program) paddle.fluid.framework.switch_main_program(loss.block.program)
# print("default program id after switch:", id(default_program))
else: else:
optimize_ops, params_grads = self.user_defined_optimizer.minimize( optimize_ops, params_grads = self.user_defined_optimizer.minimize(
...@@ -1555,6 +1578,7 @@ class Fleet(object): ...@@ -1555,6 +1578,7 @@ class Fleet(object):
context["program_params_grads"] = params_grads context["program_params_grads"] = params_grads
if graph_optimizer: if graph_optimizer:
# print("before graph minimize program id:", id(loss.block.program))
optimize_ops, params_grads = graph_optimizer.minimize( optimize_ops, params_grads = graph_optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set) loss, startup_program, parameter_list, no_grad_set=no_grad_set)
# since we do not encourage users to use graph operations # since we do not encourage users to use graph operations
...@@ -1568,13 +1592,90 @@ class Fleet(object): ...@@ -1568,13 +1592,90 @@ class Fleet(object):
if not self._role_maker._is_heter_parameter_server_mode: if not self._role_maker._is_heter_parameter_server_mode:
program = paddle.static.default_main_program() program = paddle.static.default_main_program()
opt_info = {} opt_info = {} if program._fleet_opt is None else program._fleet_opt
opt_info["mpi_size"] = self.worker_num()
opt_info["mpi_rank"] = self.worker_index()
for k, v in self._user_defined_strategy.trainer_desc_configs.items(
):
opt_info[k] = v
program._fleet_opt = opt_info
if self._runtime_handle is None:
self._runtime_handle = RuntimeFactory()._create_runtime(context)
import paddle.distributed.fleet as fleet
fleet.util._set_strategy(context["valid_strategy"])
return optimize_ops, params_grads
def _minimize_losses_impl(self,
losses,
startup_programs=None,
parameter_list=None,
no_grad_set=None):
context = {}
# cache original feed forward program
self.origin_main_program = losses[0].block.program
context["origin_main_program"] = self.origin_main_program
context["origin_main_programs"] = []
for loss in losses:
context["origin_main_programs"].append(loss.block.program)
context["loss"] = losses
if startup_programs is None:
if len(losses) == 1:
startup_programs = [paddle.static.default_startup_program()]
else:
raise ValueError(
"startup_program can't be None when loss is list.")
self.origin_startup_program = startup_programs[0].clone(for_test=False)
context["origin_startup_program"] = startup_programs[0]
context["origin_startup_programs"] = []
for program in startup_programs:
context["origin_startup_programs"].append(program)
context["role_maker"] = self._role_maker
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
context["valid_strategy"] = copy.deepcopy(self._user_defined_strategy)
self._context = context
self.valid_strategy = context["valid_strategy"]
self.valid_strategy._enable_env()
optimize_ops = []
params_grads = []
from ..meta_optimizers import ParameterServerOptimizer
ps_optimizer = ParameterServerOptimizer(self.user_defined_optimizer)
ps_optimizer._set_basic_info(losses, self._role_maker,
self.user_defined_optimizer,
self._user_defined_strategy)
optimize_ops, params_grads = ps_optimizer.minimize_losses_impl(
losses, startup_programs, parameter_list, no_grad_set=no_grad_set)
# default_program = paddle.static.default_main_program()
# if id(default_program) != id(losses[0].block.program):
# paddle.fluid.framework.switch_main_program(losses[0].block.program)
context["program_optimize_ops"] = optimize_ops
context["program_params_grads"] = params_grads
for loss in losses:
program = loss.block.program
opt_info = {} if program._fleet_opt is None else program._fleet_opt
opt_info["mpi_size"] = self.worker_num() opt_info["mpi_size"] = self.worker_num()
opt_info["mpi_rank"] = self.worker_index() opt_info["mpi_rank"] = self.worker_index()
for k, v in self._user_defined_strategy.trainer_desc_configs.items( for k, v in self._user_defined_strategy.trainer_desc_configs.items(
): ):
opt_info[k] = v opt_info[k] = v
program._fleet_opt = opt_info program._fleet_opt = opt_info
# print("fleet base opt info:", id(program), program._fleet_opt)
if self._runtime_handle is None: if self._runtime_handle is None:
self._runtime_handle = RuntimeFactory()._create_runtime(context) self._runtime_handle = RuntimeFactory()._create_runtime(context)
......
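
With the changes above, fleet.minimize accepts a list of losses in parameter-server mode (routed to _minimize_losses_impl), and init_worker takes an optional scopes argument. A hedged sketch of the multi-loss path, assuming static graph mode, a PS job launched with fleetrun (role environment variables not shown), and one startup program per loss:

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=False)

def build_program():
    main, startup = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data(name="x", shape=[-1, 8], dtype="float32")
        y = paddle.static.data(name="y", shape=[-1, 1], dtype="float32")
        pred = paddle.static.nn.fc(x, size=1)
        loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))
    return main, startup, loss

main1, startup1, loss1 = build_program()
main2, startup2, loss2 = build_program()

strategy = fleet.DistributedStrategy()
strategy.a_sync = True
opt = fleet.distributed_optimizer(paddle.optimizer.Adam(1e-3), strategy)

# A list of losses is only valid in PS mode; startup programs must be supplied.
opt.minimize([loss1, loss2], startup_program=[startup1, startup2])

if fleet.is_worker():
    fleet.init_worker()  # scopes=None keeps the previous single-scope behavior
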
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from ..runtime.collective_runtime import CollectiveRuntime from ..runtime.collective_runtime import CollectiveRuntime
from ..runtime.parameter_server_runtime import ParameterServerRuntime from ..runtime.parameter_server_runtime import ParameterServerRuntime
from ..runtime.the_one_ps import TheOnePSRuntime from ...ps.the_one_ps import TheOnePSRuntime
__all__ = [] __all__ = []
......
...@@ -17,7 +17,7 @@ from .asp_optimizer import ASPOptimizer ...@@ -17,7 +17,7 @@ from .asp_optimizer import ASPOptimizer
from .recompute_optimizer import RecomputeOptimizer from .recompute_optimizer import RecomputeOptimizer
from .gradient_merge_optimizer import GradientMergeOptimizer from .gradient_merge_optimizer import GradientMergeOptimizer
from .graph_execution_optimizer import GraphExecutionOptimizer from .graph_execution_optimizer import GraphExecutionOptimizer
from .parameter_server_optimizer import ParameterServerOptimizer from .ps_optimizer import ParameterServerOptimizer
from .pipeline_optimizer import PipelineOptimizer from .pipeline_optimizer import PipelineOptimizer
from .localsgd_optimizer import LocalSGDOptimizer from .localsgd_optimizer import LocalSGDOptimizer
from .localsgd_optimizer import AdaptiveLocalSGDOptimizer from .localsgd_optimizer import AdaptiveLocalSGDOptimizer
......
...@@ -110,8 +110,9 @@ class ParameterServerOptimizer(MetaOptimizerBase): ...@@ -110,8 +110,9 @@ class ParameterServerOptimizer(MetaOptimizerBase):
no_grad_set) no_grad_set)
if startup_program == None: if startup_program == None:
startup_program = paddle.static.default_startup_program() startup_program = paddle.static.default_startup_program()
print("program after inner optimizer minimize:",
str(loss.block.program)) # print("program after inner optimizer minimize:",
# str(loss.block.program))
self._set_origin_programs([loss]) self._set_origin_programs([loss])
self._init_ps_pass_context(loss, startup_program) self._init_ps_pass_context(loss, startup_program)
ps_builder = PsProgramBuilderFactory()._create_ps_program_builder( ps_builder = PsProgramBuilderFactory()._create_ps_program_builder(
...@@ -181,7 +182,6 @@ class ParameterServerOptimizer(MetaOptimizerBase): ...@@ -181,7 +182,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
if not var.persistable or var.desc.type( if not var.persistable or var.desc.type(
) != core.VarDesc.VarType.LOD_TENSOR: ) != core.VarDesc.VarType.LOD_TENSOR:
continue continue
set_var_lod_type(var)
param_memory_size += get_var_mem_size(var) param_memory_size += get_var_mem_size(var)
processed_var_names.add(varname) processed_var_names.add(varname)
...@@ -211,9 +211,8 @@ class ParameterServerOptimizer(MetaOptimizerBase): ...@@ -211,9 +211,8 @@ class ParameterServerOptimizer(MetaOptimizerBase):
data_count *= (-x) data_count *= (-x)
else: else:
data_count *= x data_count *= x
program_tmp_vars[var_name] = ( program_tmp_vars[var_name] = (data_count, neg_dim_count,
data_count, neg_dim_count, dtype_to_size[var.dtype])
vars_metatools.dtype_to_size[var.dtype])
for varname in program_tmp_vars: for varname in program_tmp_vars:
data_count, neg_dim_count, type_size = program_tmp_vars[varname] data_count, neg_dim_count, type_size = program_tmp_vars[varname]
...@@ -228,12 +227,19 @@ class ParameterServerOptimizer(MetaOptimizerBase): ...@@ -228,12 +227,19 @@ class ParameterServerOptimizer(MetaOptimizerBase):
return False return False
def _enable_strategy(self, dist_strategy, context): def _enable_strategy(self, dist_strategy, context):
a_sync_configs = dist_strategy.a_sync_configs
if dist_strategy.a_sync_configs["k_steps"] >= 0: if dist_strategy.a_sync_configs["k_steps"] >= 0:
return return
dist_strategy.a_sync = True dist_strategy.a_sync = True
a_sync_configs = dist_strategy.a_sync_configs
is_geo = self._can_apply_geo(context["origin_main_program"]) is_geo = self._can_apply_geo(context["origin_main_program"])
dist_strategy.a_sync_configs["k_steps"] = 800 if is_geo else 0
a_sync_configs["k_steps"] = 800 if is_geo else 0
dist_strategy.a_sync_configs = a_sync_configs
def _disable_strategy(self, dist_strategy): def _disable_strategy(self, dist_strategy):
dist_strategy.a_sync = False dist_strategy.a_sync = False
a_sync_configs = dist_strategy.a_sync_configs
dist_strategy.a_sync_configs["k_steps"] = -1 dist_strategy.a_sync_configs["k_steps"] = -1
dist_strategy.a_sync_configs = a_sync_configs
...@@ -62,9 +62,9 @@ def get_default_accessor_proto(accessor, varname, o_main_program): ...@@ -62,9 +62,9 @@ def get_default_accessor_proto(accessor, varname, o_main_program):
if not accessor.HasField("accessor_class"): if not accessor.HasField("accessor_class"):
accessor.accessor_class = "CtrCommonAccessor" accessor.accessor_class = "CtrCommonAccessor"
if not accessor.HasField("fea_dim"): if not accessor.HasField("fea_dim"):
accessor.fea_dim = embedding_dim + 2 accessor.fea_dim = embedding_dim
if not accessor.HasField("embedx_dim"): if not accessor.HasField("embedx_dim"):
accessor.embedx_dim = embedding_dim - 1 accessor.embedx_dim = embedding_dim - 3
if not accessor.HasField("embedx_threshold"): if not accessor.HasField("embedx_threshold"):
accessor.embedx_threshold = 0 accessor.embedx_threshold = 0
...@@ -129,15 +129,15 @@ def check_embedding_dim(accessor, varname, o_main_program): ...@@ -129,15 +129,15 @@ def check_embedding_dim(accessor, varname, o_main_program):
embedding_dim = var.shape[1] embedding_dim = var.shape[1]
break break
fea_dim = accessor.fea_dim fea_dim = accessor.fea_dim
if fea_dim != embedding_dim + 2: if fea_dim != embedding_dim:
raise ValueError( raise ValueError(
"The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}". "The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}".
format(embedding_dim + 2, fea_dim)) format(embedding_dim, fea_dim))
embedx_dim = accessor.embedx_dim embedx_dim = accessor.embedx_dim
if embedx_dim != embedding_dim - 1: if embedx_dim != embedding_dim - 3:
raise ValueError( raise ValueError(
"The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}". "The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}".
format(embedding_dim - 1, embedx_dim)) format(embedding_dim - 3, embedx_dim))
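A small sketch of the arithmetic this hunk now enforces (function name is illustrative, not a Paddle API): for this accessor the fea_dim must equal the sparse embedding dim and embedx_dim must equal that dim minus 3.

def check_ctr_common_dims(embedding_dim, fea_dim, embedx_dim):
    # mirrors the updated checks above
    if fea_dim != embedding_dim:
        raise ValueError("fea_dim should be %d, got %d" % (embedding_dim, fea_dim))
    if embedx_dim != embedding_dim - 3:
        raise ValueError("embedx_dim should be %d, got %d" %
                         (embedding_dim - 3, embedx_dim))

check_ctr_common_dims(embedding_dim=11, fea_dim=11, embedx_dim=8)  # passes under the new rule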
class Accessor: class Accessor:
...@@ -927,7 +927,6 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -927,7 +927,6 @@ class TheOnePSRuntime(RuntimeBase):
tables = [] tables = []
for idx, (name, ctx) in enumerate(send_ctx.items()): for idx, (name, ctx) in enumerate(send_ctx.items()):
print(" wxm python test send_ctx.items-->", idx, (name, ctx))
if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1: if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1:
continue continue
......
...@@ -75,7 +75,7 @@ class DistributedInfer: ...@@ -75,7 +75,7 @@ class DistributedInfer:
if self.sparse_table_maps is None: if self.sparse_table_maps is None:
self.sparse_table_maps = {} self.sparse_table_maps = {}
send_ctx = fleet.fleet._runtime_handle._communicator.send_ctx_ send_ctx = fleet.fleet._runtime_handle._send_ctx
for gradname, ctx in send_ctx.items(): for gradname, ctx in send_ctx.items():
if ctx.is_sparse: if ctx.is_sparse:
param = gradname.strip("@GRAD") param = gradname.strip("@GRAD")
......
...@@ -155,8 +155,6 @@ class AddListenAndServPass(PassBase): ...@@ -155,8 +155,6 @@ class AddListenAndServPass(PassBase):
main_program.global_block().append_op( main_program.global_block().append_op(
type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=opt) type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=opt)
attrs['cloned_main'] = main_program
@register_pass("add_rpc_global_flags_pass") @register_pass("add_rpc_global_flags_pass")
class AddRpcGlobalFlagsPass(PassBase): class AddRpcGlobalFlagsPass(PassBase):
......
...@@ -116,7 +116,7 @@ class DistributedOpsPass(PassBase): ...@@ -116,7 +116,7 @@ class DistributedOpsPass(PassBase):
def _check_conflict(self, other_pass): def _check_conflict(self, other_pass):
return True return True
def _push_sparse_fuse(self, _program, push_sparse_ops, attrs): def _push_sparse_fuse(self, _program, push_sparse_ops, attrs, use_cvm_op):
if attrs['use_ps_gpu']: if attrs['use_ps_gpu']:
return return
if len(push_sparse_ops) == 0: if len(push_sparse_ops) == 0:
...@@ -211,7 +211,8 @@ class DistributedOpsPass(PassBase): ...@@ -211,7 +211,8 @@ class DistributedOpsPass(PassBase):
"is_distributed": is_distributed, "is_distributed": is_distributed,
"padding_idx": padding_idx, "padding_idx": padding_idx,
"table_id": table_id, "table_id": table_id,
"size": self.emb_size[param] "size": self.emb_size[param],
"use_cvm_op": use_cvm_op
}) })
def _pull_sparse_fuse(self, _program, pull_sparse_ops, attrs, send_ctx): def _pull_sparse_fuse(self, _program, pull_sparse_ops, attrs, send_ctx):
...@@ -420,6 +421,7 @@ class DistributedOpsPass(PassBase): ...@@ -420,6 +421,7 @@ class DistributedOpsPass(PassBase):
pull_sparse_ids = {} pull_sparse_ids = {}
push_sparse_ops = {} push_sparse_ops = {}
ops = {} ops = {}
use_cvm_op = False
for op in _program.global_block().ops: for op in _program.global_block().ops:
if op.type in SPARSE_OP_TYPE_DICT.keys() \ if op.type in SPARSE_OP_TYPE_DICT.keys() \
and op.attr('remote_prefetch') is True: and op.attr('remote_prefetch') is True:
...@@ -433,6 +435,9 @@ class DistributedOpsPass(PassBase): ...@@ -433,6 +435,9 @@ class DistributedOpsPass(PassBase):
ids = pull_sparse_ids.get(param_name, []) ids = pull_sparse_ids.get(param_name, [])
ids.append(op.input("Ids")[0]) ids.append(op.input("Ids")[0])
pull_sparse_ids[param_name] = ids pull_sparse_ids[param_name] = ids
if op.type == 'cvm':
use_cvm_op = True
for op in _program.global_block().ops: for op in _program.global_block().ops:
if op.type in SPARSE_GRAD_OP_TYPE_DICT.keys(): if op.type in SPARSE_GRAD_OP_TYPE_DICT.keys():
param_name = op.input(SPARSE_GRAD_OP_TYPE_DICT[op.type])[0] param_name = op.input(SPARSE_GRAD_OP_TYPE_DICT[op.type])[0]
...@@ -442,16 +447,16 @@ class DistributedOpsPass(PassBase): ...@@ -442,16 +447,16 @@ class DistributedOpsPass(PassBase):
ops.append(op) ops.append(op)
push_sparse_ops[param_name] = ops push_sparse_ops[param_name] = ops
return pull_sparse_ops, push_sparse_ops return pull_sparse_ops, push_sparse_ops, use_cvm_op
def _apply_single_impl(self, main_program, startup_program, pass_ctx): def _apply_single_impl(self, main_program, startup_program, pass_ctx):
attrs = pass_ctx._attrs attrs = pass_ctx._attrs
pull_sparse_ops, push_sparse_ops = self._get_pull_sparse_ops( pull_sparse_ops, push_sparse_ops, use_cvm_op = self._get_pull_sparse_ops(
main_program, attrs) main_program, attrs)
send_ctx = get_the_one_send_context( send_ctx = get_the_one_send_context(
attrs, split_dense_table=attrs['is_heter_ps_mode']) attrs, split_dense_table=attrs['is_heter_ps_mode'])
self._pull_sparse_fuse(main_program, pull_sparse_ops, attrs, send_ctx) self._pull_sparse_fuse(main_program, pull_sparse_ops, attrs, send_ctx)
self._push_sparse_fuse(main_program, push_sparse_ops, attrs) self._push_sparse_fuse(main_program, push_sparse_ops, attrs, use_cvm_op)
@register_pass("delete_optimizer_pass") @register_pass("delete_optimizer_pass")
......
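A sketch of the use_cvm_op flag introduced in this pass (the dict-based op objects are stand-ins for real Operator instances): the block's ops are scanned once for a cvm op, and the resulting flag is attached to the fused distributed_push_sparse attributes.

def detect_use_cvm(block_ops):
    # True if any op in the block is a cvm op
    return any(op["type"] == "cvm" for op in block_ops)

ops = [{"type": "lookup_table"}, {"type": "cvm"}, {"type": "mul"}]
push_sparse_attrs = {"table_id": 0, "use_cvm_op": detect_use_cvm(ops)}
print(push_sparse_attrs)  # {'table_id': 0, 'use_cvm_op': True}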
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import warnings import warnings
import os import os
from paddle.distributed.fleet.proto import ps_pb2 import paddle.distributed.fleet.proto.the_one_ps_pb2 as ps_pb2
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
from paddle.fluid import core from paddle.fluid import core
...@@ -68,16 +68,30 @@ def check_embedding_dim(accessor_proto, varname, program_id, context): ...@@ -68,16 +68,30 @@ def check_embedding_dim(accessor_proto, varname, program_id, context):
print('new var: {}, {}, {}'.format(var, embedding_dim, print('new var: {}, {}, {}'.format(var, embedding_dim,
accessor_proto.fea_dim)) accessor_proto.fea_dim))
break break
fea_dim = accessor_proto.fea_dim fea_dim = accessor_proto.fea_dim
if fea_dim != embedding_dim + 2: if accessor_proto.accessor_class == "SparseAccessor":
raise ValueError( if fea_dim != embedding_dim + 2:
"The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}". raise ValueError(
format(embedding_dim + 2, fea_dim)) "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}".
format(embedding_dim + 2, fea_dim))
else:
if fea_dim != embedding_dim:
raise ValueError(
"The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}".
format(embedding_dim, fea_dim))
embedx_dim = accessor_proto.embedx_dim embedx_dim = accessor_proto.embedx_dim
if embedx_dim != embedding_dim - 1: if accessor_proto.accessor_class == "SparseAccessor":
raise ValueError( if embedx_dim != embedding_dim - 1:
"The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}". raise ValueError(
format(embedding_dim - 1, embedx_dim)) "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}".
format(embedding_dim - 1, embedx_dim))
else:
if embedx_dim != embedding_dim - 3:
raise ValueError(
"The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}".
format(embedding_dim - 3, embedx_dim))
class Service: class Service:
...@@ -119,11 +133,18 @@ class Accessor: ...@@ -119,11 +133,18 @@ class Accessor:
break break
if not accessor_proto.HasField("accessor_class"): if not accessor_proto.HasField("accessor_class"):
accessor_proto.accessor_class = "CtrCommonAccessor" # DownpourSparseValueAccessor
accessor_proto.accessor_class = "SparseAccessor"
if not accessor_proto.HasField("fea_dim"): if not accessor_proto.HasField("fea_dim"):
accessor_proto.fea_dim = embedding_dim + 2 if accessor_proto.accessor_class == "SparseAccessor":
accessor_proto.fea_dim = embedding_dim + 2
else:
accessor_proto.fea_dim = embedding_dim
if not accessor_proto.HasField("embedx_dim"): if not accessor_proto.HasField("embedx_dim"):
accessor_proto.embedx_dim = embedding_dim - 1 if accessor_proto.accessor_class == "SparseAccessor":
accessor_proto.embedx_dim = embedding_dim - 1
else:
accessor_proto.embedx_dim = embedding_dim - 3
if not accessor_proto.HasField("embedx_threshold"): if not accessor_proto.HasField("embedx_threshold"):
accessor_proto.embedx_threshold = 0 accessor_proto.embedx_threshold = 0
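A sketch summarizing the new per-accessor defaults (function name is illustrative; embedding_dim is the second dimension of the sparse embedding variable): SparseAccessor keeps the old +2 / -1 layout, every other accessor class uses the plain dim and dim - 3.

def default_dims(accessor_class, embedding_dim):
    if accessor_class == "SparseAccessor":
        return {"fea_dim": embedding_dim + 2, "embedx_dim": embedding_dim - 1}
    # e.g. CtrCommonAccessor and other accessor classes
    return {"fea_dim": embedding_dim, "embedx_dim": embedding_dim - 3}

print(default_dims("SparseAccessor", 9))     # {'fea_dim': 11, 'embedx_dim': 8}
print(default_dims("CtrCommonAccessor", 9))  # {'fea_dim': 9, 'embedx_dim': 6}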
...@@ -268,16 +289,16 @@ class CommonAccessor(Accessor): ...@@ -268,16 +289,16 @@ class CommonAccessor(Accessor):
attr_str = "" attr_str = ""
origin_var_name = value_name origin_var_name = value_name
print("get_initializer_attr param name:", value_name) # print("get_initializer_attr param name:", value_name)
for op in o_startup_program.global_block().ops: for op in o_startup_program.global_block().ops:
if op.type in self.opt_init_map.keys( if op.type in self.opt_init_map.keys(
) and origin_var_name == op.output("Out")[0]: ) and origin_var_name == op.output("Out")[0]:
init_attr = [op.type] init_attr = [op.type]
print("get_initializer_attr op type:", op.type) # print("get_initializer_attr op type:", op.type)
for attr in self.opt_init_map[op.type]: for attr in self.opt_init_map[op.type]:
print("get_initializer_attr opt_init_map attr:", attr) # print("get_initializer_attr opt_init_map attr:", attr)
init_attr.append(str(op.attr(attr))) init_attr.append(str(op.attr(attr)))
print("get_initializer_attr op attr:", str(op.attr(attr))) # print("get_initializer_attr op attr:", str(op.attr(attr)))
attr_str = l_in.join(init_attr) attr_str = l_in.join(init_attr)
break break
return attr_str return attr_str
...@@ -288,16 +309,16 @@ class CommonAccessor(Accessor): ...@@ -288,16 +309,16 @@ class CommonAccessor(Accessor):
size = ctx.sections()[0] size = ctx.sections()[0]
single_dim = ctx.sections()[1] if ctx.is_sparse() else 1 single_dim = ctx.sections()[1] if ctx.is_sparse() else 1
adam_d2sum = context["user_defined_strategy"].adam_d2sum adam_d2sum = context["user_defined_strategy"].adam_d2sum
print("parse_by_optimizer table_id:{} is_datanorm:{}".format( # print("parse_by_optimizer table_id:{} is_datanorm:{}".format(
ctx.table_id(), ctx.is_datanorm_table())) # ctx.table_id(), ctx.is_datanorm_table()))
main_program, startup_program, idx = get_program_by_id(context, main_program, startup_program, idx = get_program_by_id(context,
ctx.program_id()) ctx.program_id())
pserver_id = get_role_id(context['role_maker']) pserver_id = get_role_id(context['role_maker'])
pserver_num = len(get_ps_endpoints(context['role_maker'])) pserver_num = len(get_ps_endpoints(context['role_maker']))
optimizer_ops = get_optimize_ops(main_program) optimizer_ops = get_optimize_ops(main_program)
print("the one ps optimizer_ops:", optimizer_ops) # print("the one ps optimizer_ops:", optimizer_ops)
print("the one ps parse_by_optimizer grad_name:", grad_name) # print("the one ps parse_by_optimizer grad_name:", grad_name)
oop = None oop = None
for op in optimizer_ops: for op in optimizer_ops:
...@@ -394,7 +415,7 @@ class CommonAccessor(Accessor): ...@@ -394,7 +415,7 @@ class CommonAccessor(Accessor):
initializer = self.get_initializer_attr(param.name, initializer = self.get_initializer_attr(param.name,
startup_program) startup_program)
elif formal_name == "SummaryDecayRate": elif formal_name == "SummaryDecayRate":
initializer = "fill_constant&0.99999" initializer = "fill_constant&0.999999"
else: else:
initializer = "fill_constant&0" initializer = "fill_constant&0"
initializers.append(initializer) initializers.append(initializer)
...@@ -740,7 +761,6 @@ class PsDescBuilder(object): ...@@ -740,7 +761,6 @@ class PsDescBuilder(object):
def _get_tables(self): def _get_tables(self):
tables = [] tables = []
for idx, (name, ctx) in enumerate(self.send_ctx.items()): for idx, (name, ctx) in enumerate(self.send_ctx.items()):
print('####### {}\n'.format(ctx.is_sparse()))
if ctx.is_sparse(): if ctx.is_sparse():
if self.ps_mode == DistributedMode.GEO: if self.ps_mode == DistributedMode.GEO:
tables.append(globals()['GeoSparseTable'](self.context, tables.append(globals()['GeoSparseTable'](self.context,
...@@ -778,11 +798,11 @@ class PsDescBuilder(object): ...@@ -778,11 +798,11 @@ class PsDescBuilder(object):
return text_format.MessageToString(self.ps_desc) return text_format.MessageToString(self.ps_desc)
def build_server_desc(self): def build_server_desc(self):
self.sparse_table_maps = {}
for table in self.tables: for table in self.tables:
table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add(
) )
table._set(table_proto) table._set(table_proto)
self.sparse_table_maps = {}
if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None: if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None:
self.sparse_table_maps[ self.sparse_table_maps[
table_proto.common.table_name] = table_proto.table_id table_proto.common.table_name] = table_proto.table_id
...@@ -801,6 +821,7 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -801,6 +821,7 @@ class TheOnePSRuntime(RuntimeBase):
self._worker = fluid.core.DistFleetWrapper() self._worker = fluid.core.DistFleetWrapper()
self._server_sub_program = [] self._server_sub_program = []
self._heter_client = None self._heter_client = None
self._send_ctx = None
def _set_basic_info(self, context): def _set_basic_info(self, context):
self.context = context self.context = context
...@@ -835,7 +856,40 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -835,7 +856,40 @@ class TheOnePSRuntime(RuntimeBase):
self.ps_desc_builder = PsDescBuilder(self.context) self.ps_desc_builder = PsDescBuilder(self.context)
def _init_worker(self): def _init_params(self, scopes, send_ctx, recv_map):
for name, ctx in send_ctx.items():
if ctx.is_sparse():
continue
_, _, idx = get_program_by_id(self.context, ctx.program_id())
scope = scopes[idx]
table_id = ctx.table_id()
var_names = recv_map[table_id]
# print("init params:", idx, table_id, var_names)
self._worker.push_dense_params(scope, table_id, var_names)
def _pull_all_dense(self, scopes, send_ctx, recv_map):
for name, ctx in send_ctx.items():
if ctx.is_sparse():
continue
_, _, idx = get_program_by_id(self.context, ctx.program_id())
scope = scopes[idx]
table_id = ctx.table_id()
var_names = recv_map[table_id]
# print("pull all dense:", idx, table_id, var_names)
self._worker.pull_dense_params(scope, table_id, var_names)
def _pull_dense(self, program, scope, send_ctx, recv_map):
for name, ctx in send_ctx.items():
if ctx.is_sparse():
continue
if ctx.program_id() != id(program):
continue
table_id = ctx.table_id()
var_names = recv_map[table_id]
# print("pull dense:", table_id, var_names)
self._worker.pull_dense_params(scope, table_id, var_names)
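A sketch of the bookkeeping these three helpers share (FakeCtx is a stand-in, not a Paddle class): send_ctx maps a gradient name to a context carrying the table id and sparsity, recv_map maps a dense table id to the variable names it serves, and only non-sparse contexts are pushed or pulled.

class FakeCtx:
    # stand-in for the send-context objects; only the fields used here
    def __init__(self, table_id, is_sparse):
        self._table_id, self._is_sparse = table_id, is_sparse
    def table_id(self):
        return self._table_id
    def is_sparse(self):
        return self._is_sparse

send_ctx = {"fc_0.w_0@GRAD": FakeCtx(1, False), "emb@GRAD": FakeCtx(0, True)}
recv_map = {1: ["fc_0.w_0", "fc_0.b_0"]}  # table_id -> dense var names

for name, ctx in send_ctx.items():
    if ctx.is_sparse():
        continue  # only dense tables are pushed/pulled by these helpers
    print("dense table", ctx.table_id(), "vars", recv_map[ctx.table_id()])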
def _init_worker(self, scopes=None):
worker_desc = self.ps_desc_builder.build_worker_desc() worker_desc = self.ps_desc_builder.build_worker_desc()
if self.context['use_ps_gpu']: if self.context['use_ps_gpu']:
...@@ -866,6 +920,7 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -866,6 +920,7 @@ class TheOnePSRuntime(RuntimeBase):
split_dense_table=self.is_heter_ps_mode, split_dense_table=self.is_heter_ps_mode,
use_origin_program=self.is_heter_ps_mode, use_origin_program=self.is_heter_ps_mode,
ep_list=self.endpoints) ep_list=self.endpoints)
self._send_ctx = send_ctx
trainer_config = self.context['trainer'] trainer_config = self.context['trainer']
debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
...@@ -889,23 +944,32 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -889,23 +944,32 @@ class TheOnePSRuntime(RuntimeBase):
kwargs.update(sync_kwargs) kwargs.update(sync_kwargs)
print("communicator config:", trainer_config.get_communicator_flags()) print("communicator config:", trainer_config.get_communicator_flags())
self._communicator = Communicator(
trainer_config.mode, kwargs,
trainer_config.get_communicator_flags())
self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt,
self.string_hosts,
fluid.global_scope())
role_id = get_role_id(self.role_maker)
self._worker.init_worker(proto_txt, self.string_hosts, role_id)
if self.context['ps_mode'] == DistributedMode.GEO:
self._communicator = Communicator(
trainer_config.mode, kwargs,
trainer_config.get_communicator_flags())
self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt,
self.string_hosts,
fluid.global_scope())
fleet.util.barrier() fleet.util.barrier()
info = self._communicator.get_client_info()
# info = self._communicator.get_client_info()
info = self._worker.get_client_info()
if isinstance(info, list) and len(info) > 0: if isinstance(info, list) and len(info) > 0:
all_info = self.role_maker._all_gather(info[0]) all_info = self.role_maker._all_gather(info[0])
# for unittest # for unittest
if not isinstance(all_info, list): if not isinstance(all_info, list):
warnings.warn("gloo may not initialize correctly") warnings.warn("gloo may not initialize correctly")
all_info = [all_info] all_info = [all_info]
self._communicator.set_clients(all_info)
self._communicator.create_client_to_client_connection() # self._communicator.set_clients(all_info)
# self._communicator.create_client_to_client_connection()
self._worker.set_clients(all_info)
self._worker.create_client2client_connection()
print('create c2c connection done') print('create c2c connection done')
else: else:
print('cannot create c2c connection') print('cannot create c2c connection')
...@@ -914,6 +978,7 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -914,6 +978,7 @@ class TheOnePSRuntime(RuntimeBase):
is_test = bool(int(os.getenv("TEST_MODE", "0"))) is_test = bool(int(os.getenv("TEST_MODE", "0")))
# for GEO
if self.role_maker._is_first_worker() and self.is_heter_ps_mode: if self.role_maker._is_first_worker() and self.is_heter_ps_mode:
# for ps-heter mode load all parameters on first_worker # for ps-heter mode load all parameters on first_worker
init_params = get_the_one_recv_context( init_params = get_the_one_recv_context(
...@@ -921,16 +986,38 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -921,16 +986,38 @@ class TheOnePSRuntime(RuntimeBase):
else: else:
init_params = dense_map init_params = dense_map
# if not is_test:
# self._communicator.init_params(init_params)
# fleet.util.barrier()
# self._communicator.pull_dense(init_params)
# fleet.util.barrier()
if scopes is None:
if len(self.origin_main_programs) > 1:
raise ValueError(
"You must set the scope list when you have Multiple programs"
)
scopes = [fluid.global_scope()]
if len(self.origin_main_programs) != len(scopes):
raise ValueError("len(programs) != len(scopes)")
self.scopes = scopes
if not is_test: if not is_test:
self._communicator.init_params(init_params) if self.context['ps_mode'] == DistributedMode.GEO:
self._communicator.init_params(init_params)
else:
if role_id == 0:
self._init_params(scopes, send_ctx, dense_map)
fleet.util.barrier() fleet.util.barrier()
self._communicator.pull_dense(init_params) self._pull_all_dense(scopes, send_ctx, dense_map)
fleet.util.barrier() fleet.util.barrier()
if not self._communicator.is_running(): if self.context['ps_mode'] == DistributedMode.GEO:
self._communicator.start() if not self._communicator.is_running():
else: self._communicator.start()
warnings.warn("communicator has been initialized, skip") else:
warnings.warn("communicator has been initialized, skip")
launch_barrier = dist_strategy.a_sync_configs["launch_barrier"] launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1")) launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
...@@ -996,7 +1083,9 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -996,7 +1083,9 @@ class TheOnePSRuntime(RuntimeBase):
self._server.run_server(host, int(port)) self._server.run_server(host, int(port))
def _stop_worker(self): def _stop_worker(self):
self._communicator.stop() if self.context['ps_mode'] == DistributedMode.GEO:
self._communicator.stop()
self._worker.stop_worker()
if self.is_heter_ps_mode: if self.is_heter_ps_mode:
assert self._heter_client != None, "heter client should not be None in heterps mode" assert self._heter_client != None, "heter client should not be None in heterps mode"
self._heter_client.stop() self._heter_client.stop()
...@@ -1151,7 +1240,11 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -1151,7 +1240,11 @@ class TheOnePSRuntime(RuntimeBase):
"in fleet.save() function, executor must be as Executor type") "in fleet.save() function, executor must be as Executor type")
import paddle import paddle
program = self.origin_main_program if main_program is None else main_program program = self.origin_main_programs[
0] if main_program is None else main_program
_, _, idx = get_program_by_id(self.context, id(program))
scope = self.scopes[idx]
print("save inference model scope idx:", idx)
if isinstance(program, CompiledProgram): if isinstance(program, CompiledProgram):
raise TypeError( raise TypeError(
...@@ -1180,12 +1273,14 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -1180,12 +1273,14 @@ class TheOnePSRuntime(RuntimeBase):
sparse_names = self._save_sparse_params(executor, dirname, sparses, sparse_names = self._save_sparse_params(executor, dirname, sparses,
main_program, mode) main_program, mode)
denses = get_the_one_recv_context( dense_map = get_the_one_recv_context(
self.context, split_dense_table=self.is_heter_ps_mode)
send_ctx = get_the_one_send_context(
self.context, self.context,
is_dense=True,
split_dense_table=self.is_heter_ps_mode, split_dense_table=self.is_heter_ps_mode,
use_origin_program=True) use_origin_program=self.is_heter_ps_mode,
self._communicator.pull_dense(denses) ep_list=self.endpoints)
self._pull_dense(program, scope, send_ctx, dense_map)
generate_vars = self.context[ generate_vars = self.context[
"user_defined_strategy"].trainer_desc_configs["stat_var_names"] "user_defined_strategy"].trainer_desc_configs["stat_var_names"]
...@@ -1196,7 +1291,7 @@ class TheOnePSRuntime(RuntimeBase): ...@@ -1196,7 +1291,7 @@ class TheOnePSRuntime(RuntimeBase):
infer_program.list_vars())) infer_program.list_vars()))
for var in remaining_vars: for var in remaining_vars:
tensor = var.get_value() tensor = var.get_value(scope)
paddle.save( paddle.save(
tensor, tensor,
os.path.join(model_path, var.name), os.path.join(model_path, var.name),
......
...@@ -37,6 +37,37 @@ class PsProgramBuilder(object): ...@@ -37,6 +37,37 @@ class PsProgramBuilder(object):
self.server_endpoints = self.attrs['role_maker']._get_pserver_endpoints( self.server_endpoints = self.attrs['role_maker']._get_pserver_endpoints(
) )
def _build_trainer_desc(self):
opt_info = self.loss.block.program._fleet_opt
opt_info = {} if opt_info is None else opt_info
opt_info["trainer"] = opt_info.get("trainer", "DistMultiTrainer")
opt_info["device_worker"] = opt_info.get("device_worker",
"DownpourLite")
pid = str(id(self.cloned_main))
program_configs = {
pid: {
'pull_dense': [],
'push_dense': [],
'pull_sparse': [],
'push_sparse': []
}
}
dense_table_config = {}
send_ctx = get_the_one_send_context(self.attrs)
recv_ctx = get_the_one_recv_context(self.attrs)
for name, ctx in send_ctx.items():
if ctx.program_id() != id(self.loss.block.program):
continue
if ctx.is_sparse():
continue
if not ctx.is_tensor_table():
program_configs[pid]['pull_dense'].append(ctx.table_id())
program_configs[pid]['push_dense'].append(ctx.table_id())
dense_table_config[ctx.table_id()] = recv_ctx[ctx.table_id()]
opt_info['program_configs'] = program_configs
opt_info['dense_table_config'] = dense_table_config
self.cloned_main._fleet_opt = opt_info
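An illustrative sketch (all ids and variable names below are made up) of the opt_info this builder attaches to cloned_main._fleet_opt; the device worker later reads program_configs and dense_table_config from it.

opt_info = {
    "trainer": "DistMultiTrainer",
    "device_worker": "DownpourLite",
    "program_configs": {
        "140001234567": {        # str(id(cloned_main))
            "pull_dense": [2],   # dense table ids
            "push_dense": [2],
            "pull_sparse": [],
            "push_sparse": [],
        }
    },
    "dense_table_config": {2: ["fc_0.w_0", "fc_0.b_0"]},  # table_id -> recv var names
}
print(opt_info["program_configs"])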
def _optimize_programs(self): def _optimize_programs(self):
pass pass
...@@ -63,7 +94,15 @@ class PsProgramBuilder(object): ...@@ -63,7 +94,15 @@ class PsProgramBuilder(object):
logger.info("start building trainer program") logger.info("start building trainer program")
self._build_trainer_programs() self._build_trainer_programs()
fluid.framework.switch_startup_program(self.cloned_startup) fluid.framework.switch_startup_program(self.cloned_startup)
# print("ps_program_build before =", id(self.loss.block.program))
self._build_trainer_desc()
self.loss.block.program = self.cloned_main self.loss.block.program = self.cloned_main
# print("ps_program_build after =", id(self.loss.block.program))
# print("ps_program_build clone after =", id(self.cloned_main))
# print("ps_program_build after trainer_desc",
# id(self.loss.block.program))
# print("ps_program build trainer desc",
# self.loss.block.program._fleet_opt)
elif self.attrs['is_server']: elif self.attrs['is_server']:
logger.info("start building pserver program") logger.info("start building pserver program")
...@@ -92,6 +131,13 @@ class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式 ...@@ -92,6 +131,13 @@ class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式
return return
def _build_pserver_programs(self):
add_listen_and_serv_pass = new_pass('add_listen_and_serv_pass',
self.attrs)
add_listen_and_serv_pass.apply([self.attrs['_main_server']], [None],
self.pass_ctx)
return
class CpuSyncPsProgramBuilder(PsProgramBuilder): class CpuSyncPsProgramBuilder(PsProgramBuilder):
def __init__(self, pass_ctx): def __init__(self, pass_ctx):
...@@ -103,13 +149,13 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder): ...@@ -103,13 +149,13 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
format(self.ps_mode, "PsProgramBuilder")) format(self.ps_mode, "PsProgramBuilder"))
def _build_trainer_programs(self): def _build_trainer_programs(self):
print("build trainer program entry") # print("build trainer program entry")
print("before ps program builder program:", self.cloned_main) # print("before ps program builder program:", self.cloned_main)
add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
self.attrs) self.attrs)
add_lr_decay_table_pass.apply([], [], self.pass_ctx) add_lr_decay_table_pass.apply([], [], self.pass_ctx)
print("before distributed op pass") # print("before distributed op pass")
distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs) distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx) distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
...@@ -129,7 +175,7 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder): ...@@ -129,7 +175,7 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
self.attrs['origin_main_program'] = self.cloned_main self.attrs['origin_main_program'] = self.cloned_main
self.attrs['origin_startup_program'] = self.cloned_startup self.attrs['origin_startup_program'] = self.cloned_startup
print("after ps program builder program:", self.cloned_main) # print("after ps program builder program:", self.cloned_main)
if self.launch_barrier and self.launch_barrier_flag: if self.launch_barrier and self.launch_barrier_flag:
wait_server_ready(self.server_endpoints) wait_server_ready(self.server_endpoints)
......
...@@ -23,7 +23,6 @@ import logging ...@@ -23,7 +23,6 @@ import logging
import six import six
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.core import CommContext
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
...@@ -73,9 +72,9 @@ def logger_config(log_path, logging_name): ...@@ -73,9 +72,9 @@ def logger_config(log_path, logging_name):
return logger return logger
ps_log_root_dir = '/ps_log/' ps_log_root_dir = './ps_log/'
logger = logger_config( logger = logger_config(
log_path='/ps_usr_print_log', logging_name='ps_usr_print_log') log_path='./ps_usr_print_log', logging_name='ps_usr_print_log')
class DistributedMode: class DistributedMode:
...@@ -342,6 +341,7 @@ def get_dense_send_context(program, ...@@ -342,6 +341,7 @@ def get_dense_send_context(program,
aggregate = True aggregate = True
print("public get_dense_send_context dense_table:", grad_name, print("public get_dense_send_context dense_table:", grad_name,
var_numel, origin_varnames) var_numel, origin_varnames)
from paddle.fluid.core import CommContext
dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
[var_numel], origin_varnames, trainer_id, [var_numel], origin_varnames, trainer_id,
aggregate, False, False, idx, False, False, aggregate, False, False, idx, False, False,
...@@ -364,6 +364,7 @@ def get_dense_send_context(program, ...@@ -364,6 +364,7 @@ def get_dense_send_context(program,
aggregate = True aggregate = True
print("public get_dense_send_context data_norm table:", grad_name, print("public get_dense_send_context data_norm table:", grad_name,
var_numel, origin_varnames) var_numel, origin_varnames)
from paddle.fluid.core import CommContext
data_norm_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], data_norm_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
[var_numel], origin_varnames, trainer_id, [var_numel], origin_varnames, trainer_id,
aggregate, False, False, idx, False, True, aggregate, False, False, idx, False, True,
...@@ -378,6 +379,7 @@ def get_dense_send_context(program, ...@@ -378,6 +379,7 @@ def get_dense_send_context(program,
var_numel = reduce(lambda x, y: x * y, var.shape) var_numel = reduce(lambda x, y: x * y, var.shape)
grad_name = origin_varname grad_name = origin_varname
aggregate = True aggregate = True
from paddle.fluid.core import CommContext
dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"], dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
[var_numel], [origin_varname], trainer_id, [var_numel], [origin_varname], trainer_id,
aggregate, False, False, idx, False, False, aggregate, False, False, idx, False, False,
...@@ -407,7 +409,7 @@ def get_geo_trainer_send_context(context): ...@@ -407,7 +409,7 @@ def get_geo_trainer_send_context(context):
var = program.global_block().vars[grad.merged_var.name] var = program.global_block().vars[grad.merged_var.name]
var_numel = reduce(lambda x, y: x * y, var.shape[1:]) var_numel = reduce(lambda x, y: x * y, var.shape[1:])
from paddle.fluid.core import CommContext
sparse_ctx = CommContext(grad_name, [grad_name], sparse_ctx = CommContext(grad_name, [grad_name],
["127.0.0.1:6071"], [var_numel], ["127.0.0.1:6071"], [var_numel],
[grad_name], trainer_id, True, True, [grad_name], trainer_id, True, True,
...@@ -432,6 +434,7 @@ def _step_ctx(idx, role_maker): ...@@ -432,6 +434,7 @@ def _step_ctx(idx, role_maker):
endpoints = get_ps_endpoints(role_maker) endpoints = get_ps_endpoints(role_maker)
sections = [1] * len(endpoints) sections = [1] * len(endpoints)
names = [name] * len(endpoints) names = [name] * len(endpoints)
from paddle.fluid.core import CommContext
ctx = CommContext(name, names, endpoints, sections, [name], trainer_id, ctx = CommContext(name, names, endpoints, sections, [name], trainer_id,
True, False, False, idx, True, False, -1) True, False, False, idx, True, False, -1)
return name, ctx return name, ctx
...@@ -448,12 +451,8 @@ def get_the_one_send_context(context, ...@@ -448,12 +451,8 @@ def get_the_one_send_context(context,
origin_programs = context['origin_main_programs'] origin_programs = context['origin_main_programs']
idx = 0 idx = 0
for i, program in enumerate(origin_programs):
merged_dense_pairs = context['merged_dense_pairs'][i]
idx = get_dense_send_context(program, send_ctx, idx, merged_dense_pairs,
trainer_id, split_dense_table)
distibuted_varnames = get_sparse_tablenames(origin_programs, True) distibuted_varnames = get_sparse_tablenames(origin_programs, True)
print("public distibuted_varnames:", distibuted_varnames) # print("public distibuted_varnames:", distibuted_varnames)
for i, program in enumerate(origin_programs): for i, program in enumerate(origin_programs):
merged_sparse_pairs = context['merged_sparse_pairs'][i] merged_sparse_pairs = context['merged_sparse_pairs'][i]
for merged in merged_sparse_pairs: for merged in merged_sparse_pairs:
...@@ -472,10 +471,11 @@ def get_the_one_send_context(context, ...@@ -472,10 +471,11 @@ def get_the_one_send_context(context,
shape = list(var.shape) shape = list(var.shape)
shape[0] = 0 if is_distributed else shape[0] shape[0] = 0 if is_distributed else shape[0]
print("public get_the_one_send_context sparse:", grad_name, # print("public get_the_one_send_context sparse:", grad_name,
splited_varname, shape) # splited_varname, shape)
if grad_name in send_ctx: if grad_name in send_ctx:
continue continue
from paddle.fluid.core import CommContext
sparse_ctx = CommContext(grad_name, splited_varname, ep_list, shape, sparse_ctx = CommContext(grad_name, splited_varname, ep_list, shape,
[grad_name], trainer_id, True, True, [grad_name], trainer_id, True, True,
is_distributed, idx, False, False, is_distributed, idx, False, False,
...@@ -484,6 +484,11 @@ def get_the_one_send_context(context, ...@@ -484,6 +484,11 @@ def get_the_one_send_context(context,
idx += 1 idx += 1
send_ctx[sparse_ctx.var_name()] = sparse_ctx send_ctx[sparse_ctx.var_name()] = sparse_ctx
for i, program in enumerate(origin_programs):
merged_dense_pairs = context['merged_dense_pairs'][i]
idx = get_dense_send_context(program, send_ctx, idx, merged_dense_pairs,
trainer_id, split_dense_table)
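A sketch of the table-id ordering after this reordering (gradient names are made up): sparse contexts now consume the lower ids and dense contexts are appended afterwards, which is the order the loops above assign idx.

idx = 0
table_ids = {}
for grad in ["emb_0@GRAD", "emb_1@GRAD"]:   # sparse grads handled first
    table_ids[grad] = idx
    idx += 1
for grad in ["Dense@GRAD_0"]:               # dense grads follow
    table_ids[grad] = idx
    idx += 1
print(table_ids)  # {'emb_0@GRAD': 0, 'emb_1@GRAD': 1, 'Dense@GRAD_0': 2}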
if len(context['tensor_table']) > 0 and context['is_worker']: if len(context['tensor_table']) > 0 and context['is_worker']:
name, ctx = _step_ctx(idx, context['role_maker']) name, ctx = _step_ctx(idx, context['role_maker'])
send_ctx[name] = ctx send_ctx[name] = ctx
...@@ -1258,8 +1263,8 @@ def build_var_distributed(context): ...@@ -1258,8 +1263,8 @@ def build_var_distributed(context):
context["merged_variable_map"] = {} context["merged_variable_map"] = {}
for origin_program in origin_programs: for origin_program in origin_programs:
sparse_pairs, dense_pairs = get_param_grads(origin_program) sparse_pairs, dense_pairs = get_param_grads(origin_program)
print("public build_var_distributed sparse_pairs:", sparse_pairs) # print("public build_var_distributed sparse_pairs:", sparse_pairs)
print("public build_var_distributed dense_pairs:", dense_pairs) # print("public build_var_distributed dense_pairs:", dense_pairs)
origin_for_sparse = [] origin_for_sparse = []
origin_for_dense = [] origin_for_dense = []
merged_sparse_pairs = [] merged_sparse_pairs = []
...@@ -1279,8 +1284,8 @@ def build_var_distributed(context): ...@@ -1279,8 +1284,8 @@ def build_var_distributed(context):
m_grad = MergedVariable(grad, [grad], [0]) m_grad = MergedVariable(grad, [grad], [0])
merged_variables_pairs.append((m_param, m_grad)) merged_variables_pairs.append((m_param, m_grad))
merged_dense_pairs.append((m_param, m_grad)) merged_dense_pairs.append((m_param, m_grad))
print("public build_var_distributed merged_dense_pairs:", # print("public build_var_distributed merged_dense_pairs:",
merged_dense_pairs) # merged_dense_pairs)
for sparse_pair in origin_for_sparse: for sparse_pair in origin_for_sparse:
param, grad = sparse_pair param, grad = sparse_pair
...@@ -1289,8 +1294,8 @@ def build_var_distributed(context): ...@@ -1289,8 +1294,8 @@ def build_var_distributed(context):
m_grad = MergedVariable(grad, [grad], [0]) m_grad = MergedVariable(grad, [grad], [0])
merged_variables_pairs.append((m_param, m_grad)) merged_variables_pairs.append((m_param, m_grad))
merged_sparse_pairs.append((m_param, m_grad)) merged_sparse_pairs.append((m_param, m_grad))
print("public build_var_distributed merged_sparse_pairs:", # print("public build_var_distributed merged_sparse_pairs:",
merged_sparse_pairs) # merged_sparse_pairs)
for merged in merged_variables_pairs: for merged in merged_variables_pairs:
m_param, m_grad = merged m_param, m_grad = merged
...@@ -1315,18 +1320,19 @@ def build_var_distributed(context): ...@@ -1315,18 +1320,19 @@ def build_var_distributed(context):
context["param_name_to_grad_name"] = param_name_to_grad_name context["param_name_to_grad_name"] = param_name_to_grad_name
context["grad_name_to_param_name"] = grad_name_to_param_name context["grad_name_to_param_name"] = grad_name_to_param_name
print("public build_var_distributed origin_sparse_pairs:",
context["origin_sparse_pairs"]) # print("public build_var_distributed origin_sparse_pairs:",
print("public build_var_distributed origin_for_dense:", # context["origin_sparse_pairs"])
context["origin_dense_pairs"]) # print("public build_var_distributed origin_for_dense:",
print("public build_var_distributed merged_sparse_pairs:", # context["origin_dense_pairs"])
context["merged_sparse_pairs"]) # print("public build_var_distributed merged_sparse_pairs:",
print("public build_var_distributed merged_dense_pairs:", # context["merged_sparse_pairs"])
context['merged_dense_pairs']) # print("public build_var_distributed merged_dense_pairs:",
print("public build_var_distributed param_name_to_grad_name:", # context['merged_dense_pairs'])
param_name_to_grad_name) # print("public build_var_distributed param_name_to_grad_name:",
print("public build_var_distributed grad_name_to_param_name:", # param_name_to_grad_name)
grad_name_to_param_name) # print("public build_var_distributed grad_name_to_param_name:",
# grad_name_to_param_name)
def _is_opt_role_op(op): def _is_opt_role_op(op):
......
...@@ -62,13 +62,18 @@ class Communicator(object): ...@@ -62,13 +62,18 @@ class Communicator(object):
""" """
# set all recv op to not_run mode # set all recv op to not_run mode
if mode == DistributedMode.SYNC: if kwargs == None:
envs["pserver_endpoints"] = ','.join(kwargs["pserver_endpoints"]) if envs == None:
envs = {}
envs["trainers"] = str(kwargs["trainers"]) else:
envs["trainer_id"] = str(kwargs["trainer_id"]) if mode == DistributedMode.SYNC:
envs["need_global_step"] = str(kwargs["need_global_step"]) envs["pserver_endpoints"] = ','.join(kwargs[
envs["barrier_table_id"] = str(kwargs["barrier_table_id"]) "pserver_endpoints"])
envs["trainers"] = str(kwargs["trainers"])
envs["trainer_id"] = str(kwargs["trainer_id"])
envs["need_global_step"] = str(kwargs["need_global_step"])
envs["barrier_table_id"] = str(kwargs["barrier_table_id"])
mode_str = None mode_str = None
...@@ -129,6 +134,9 @@ class Communicator(object): ...@@ -129,6 +134,9 @@ class Communicator(object):
comm.start() comm.start()
comm.stop() comm.stop()
""" """
if self.communicator_ == None:
print('you must call init_with_ctx first to init comm before start')
return
self.communicator_.start() self.communicator_.start()
def stop(self): def stop(self):
...@@ -148,6 +156,9 @@ class Communicator(object): ...@@ -148,6 +156,9 @@ class Communicator(object):
comm.start() comm.start()
comm.stop() comm.stop()
""" """
if self.communicator_ == None:
print('you must call init_with_ctx first to init comm before stop')
return
self.communicator_.stop() self.communicator_.stop()
def is_running(self): def is_running(self):
...@@ -166,6 +177,9 @@ class Communicator(object): ...@@ -166,6 +177,9 @@ class Communicator(object):
comm = fluid.communicator.Communicator(prog) comm = fluid.communicator.Communicator(prog)
comm.is_running() comm.is_running()
""" """
if self.communicator_ == None:
print('you must call init_with_ctx first to init comm before stop')
return
self.communicator_.is_running() self.communicator_.is_running()
def recv(self): def recv(self):
......
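A minimal sketch of the guard this hunk adds to start()/stop()/is_running() (GuardedComm is a stand-in class): calling them before init_with_ctx now prints a warning and returns instead of dereferencing a None handle.

class GuardedComm:
    def __init__(self):
        self.communicator_ = None  # set by init_with_ctx in the real class
    def start(self):
        if self.communicator_ is None:
            print('you must call init_with_ctx first to init comm before start')
            return
        self.communicator_.start()

GuardedComm().start()  # prints the warning instead of raising AttributeError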
...@@ -862,9 +862,9 @@ class InMemoryDataset(DatasetBase): ...@@ -862,9 +862,9 @@ class InMemoryDataset(DatasetBase):
thread_num(int): shuffle thread num. Default is 12. thread_num(int): shuffle thread num. Default is 12.
""" """
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
if fleet is not None: if fleet is not None:
if not isinstance(fleet, PSLib): if hasattr(fleet, "barrier_worker"):
print("pscore fleet")
fleet.barrier_worker() fleet.barrier_worker()
else: else:
fleet._role_maker.barrier_worker() fleet._role_maker.barrier_worker()
...@@ -879,20 +879,20 @@ class InMemoryDataset(DatasetBase): ...@@ -879,20 +879,20 @@ class InMemoryDataset(DatasetBase):
self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size) self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds) self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds)
if fleet is not None: if fleet is not None:
if not isinstance(fleet, PSLib): if hasattr(fleet, "barrier_worker"):
fleet.barrier_worker() fleet.barrier_worker()
else: else:
fleet._role_maker.barrier_worker() fleet._role_maker.barrier_worker()
self.dataset.global_shuffle(thread_num) self.dataset.global_shuffle(thread_num)
if fleet is not None: if fleet is not None:
if not isinstance(fleet, PSLib): if hasattr(fleet, "barrier_worker"):
fleet.barrier_worker() fleet.barrier_worker()
else: else:
fleet._role_maker.barrier_worker() fleet._role_maker.barrier_worker()
if self.merge_by_lineid: if self.merge_by_lineid:
self.dataset.merge_by_lineid() self.dataset.merge_by_lineid()
if fleet is not None: if fleet is not None:
if not isinstance(fleet, PSLib): if hasattr(fleet, "barrier_worker"):
fleet.barrier_worker() fleet.barrier_worker()
else: else:
fleet._role_maker.barrier_worker() fleet._role_maker.barrier_worker()
...@@ -1026,9 +1026,8 @@ class InMemoryDataset(DatasetBase): ...@@ -1026,9 +1026,8 @@ class InMemoryDataset(DatasetBase):
local_data_size = np.array([local_data_size]) local_data_size = np.array([local_data_size])
print('global shuffle local_data_size: ', local_data_size) print('global shuffle local_data_size: ', local_data_size)
if fleet is not None: if fleet is not None:
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
global_data_size = local_data_size * 0 global_data_size = local_data_size * 0
if not isinstance(fleet, PSLib): if hasattr(fleet, "util"):
global_data_size = fleet.util.all_reduce(local_data_size) global_data_size = fleet.util.all_reduce(local_data_size)
else: else:
fleet._role_maker.all_reduce_worker(local_data_size, fleet._role_maker.all_reduce_worker(local_data_size,
......
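A sketch of the duck-typing switch above (both fleet classes are stand-ins): instead of importing PSLib and checking isinstance, the dataset now keys off whether the fleet object exposes barrier_worker directly, falling back to its role maker otherwise.

class PscoreFleet:
    def barrier_worker(self):
        print("pscore barrier")

class LegacyRoleMaker:
    def barrier_worker(self):
        print("role_maker barrier")

class LegacyFleet:
    _role_maker = LegacyRoleMaker()

def dataset_barrier(fleet):
    if fleet is None:
        return
    if hasattr(fleet, "barrier_worker"):
        fleet.barrier_worker()
    else:
        fleet._role_maker.barrier_worker()

dataset_barrier(PscoreFleet())  # pscore barrier
dataset_barrier(LegacyFleet())  # role_maker barrier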
...@@ -99,6 +99,7 @@ class Hogwild(DeviceWorker): ...@@ -99,6 +99,7 @@ class Hogwild(DeviceWorker):
dense_table_set = set() dense_table_set = set()
program_id = str(id(self._program)) program_id = str(id(self._program))
print("device worker program id:", program_id)
if self._program == None: if self._program == None:
print("program of current device worker is not configured") print("program of current device worker is not configured")
exit(-1) exit(-1)
...@@ -115,15 +116,20 @@ class Hogwild(DeviceWorker): ...@@ -115,15 +116,20 @@ class Hogwild(DeviceWorker):
from paddle.fluid.incubate.fleet.parameter_server import version from paddle.fluid.incubate.fleet.parameter_server import version
if version.is_transpiler() and "fleet_desc" not in opt_info: if version.is_transpiler(
) and "fleet_desc" not in opt_info and "program_configs" not in opt_info:
return return
program_configs = opt_info["program_configs"] program_configs = opt_info["program_configs"]
print("device worker program_configs:", program_configs)
for pid in program_configs: for pid in program_configs:
print("device worker", pid, program_id)
if pid == program_id: if pid == program_id:
pc = downpour.program_config.add() pc = downpour.program_config.add()
pc.program_id = program_id pc.program_id = program_id
print("device worker pull dense:",
program_configs[program_id]["pull_dense"])
for i in program_configs[program_id]["push_sparse"]: for i in program_configs[program_id]["push_sparse"]:
pc.push_sparse_table_id.extend([i]) pc.push_sparse_table_id.extend([i])
for i in program_configs[program_id]["push_dense"]: for i in program_configs[program_id]["push_dense"]:
...@@ -139,50 +145,189 @@ class Hogwild(DeviceWorker): ...@@ -139,50 +145,189 @@ class Hogwild(DeviceWorker):
trainer_desc.device_worker_name = "HogwildWorker" trainer_desc.device_worker_name = "HogwildWorker"
pull_thread = trainer_desc.pull_dense_param pull_thread = trainer_desc.pull_dense_param
pull_thread.device_num = trainer_desc.thread_num pull_thread.device_num = trainer_desc.thread_num
if opt_info.get("program_id_to_worker") is None: if opt_info.get("program_id_to_worker") is None and opt_info.get(
raise ValueError("opt_info must have program_id_to_worker") "dense_table_config") is None:
prog_id_to_worker = opt_info["program_id_to_worker"] raise ValueError(
if prog_id_to_worker.get(program_id) is None: "opt_info must have program_id_to_worker or dense_table_config")
raise ValueError("%s not found in program_id_to_worker" % if opt_info.get("program_id_to_worker") is not None:
program_id) prog_id_to_worker = opt_info["program_id_to_worker"]
worker = opt_info["program_id_to_worker"][program_id] if prog_id_to_worker.get(program_id) is None:
for i in worker.get_desc().dense_table: raise ValueError("%s not found in program_id_to_worker" %
if i.table_id in dense_table_set: program_id)
worker = opt_info["program_id_to_worker"][program_id]
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.table_id = \
i.table_id
sparse_len = len(worker.get_desc().sparse_table)
for i in range(sparse_len):
sparse_table = downpour.sparse_table.add()
sparse_table.table_id = worker.get_desc().sparse_table[
i].table_id
sparse_table.sparse_key_name.extend(worker.get_desc()
.sparse_table[i].slot_key)
sparse_table.sparse_value_name.extend(worker.get_desc(
).sparse_table[i].slot_value)
sparse_table.sparse_grad_name.extend(worker.get_desc(
).sparse_table[i].slot_gradient)
sparse_table.fea_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
i].accessor.fea_dim
# not use emb_dim
sparse_table.emb_dim = -1
# not use hard code click
sparse_table.label_var_name = ""
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = downpour.dense_table.add()
dense_table.table_id = i.table_id
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.dense_grad_name.extend(
i.dense_gradient_variable_name)
hogwild.skip_ops.extend(worker.get_desc().skip_op)
else:
dense_table_config = opt_info.get("dense_table_config")
print("device worker dense_table_config:", dense_table_config)
for table_id, varnames in dense_table_config.items():
dense_table = pull_thread.dense_table.add() dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(i.dense_variable_name) dense_table.dense_value_name.extend(varnames)
dense_table.table_id = \ dense_table.table_id = table_id
i.table_id
sparse_len = len(worker.get_desc().sparse_table)
for i in range(sparse_len):
sparse_table = downpour.sparse_table.add()
sparse_table.table_id = worker.get_desc().sparse_table[i].table_id
sparse_table.sparse_key_name.extend(worker.get_desc().sparse_table[
i].slot_key)
sparse_table.sparse_value_name.extend(worker.get_desc()
.sparse_table[i].slot_value)
sparse_table.sparse_grad_name.extend(worker.get_desc().sparse_table[
i].slot_gradient)
sparse_table.fea_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
i].accessor.fea_dim
# not use emb_dim
sparse_table.emb_dim = -1
# not use hard code click
sparse_table.label_var_name = ""
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = downpour.dense_table.add()
dense_table.table_id = i.table_id
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.dense_grad_name.extend(
i.dense_gradient_variable_name)
hogwild.skip_ops.extend(worker.get_desc().skip_op)
if self._infer: if self._infer:
hogwild.skip_ops.extend( hogwild.skip_ops.extend(
["push_sparse", "push_sparse_v2", "push_dense"]) ["push_sparse", "push_sparse_v2", "push_dense"])
class DownpourLite(DeviceWorker):
"""
DownpourLite is a kind of SGD algorithm.
"""
def __init__(self):
"""Init."""
super(DownpourLite, self).__init__()
def _gen_worker_desc(self, trainer_desc):
"""
Generator worker desc, which device worker is DownpourLiteWorker.
Args:
trainer_desc(TrainerDesc): a TrainerDesc object
"""
print("create DownpourLiteWorker")
trainer_desc.device_worker_name = "DownpourLiteWorker"
if self._infer:
# just ignore feed op for inference model
trainer_desc.downpour_param.skip_ops.extend([
"feed", "push_sparse", "push_sparse_v2", "push_dense",
"distributed_push_sparse", "send"
])
dense_table_set = set()
program_id = str(id(self._program))
print("device worker program id:", program_id)
if self._program == None:
print("program of current device worker is not configured")
exit(-1)
opt_info = self._program._fleet_opt
# when opt_info is None or empty dict, it should return
if not opt_info:
return
downpour = trainer_desc.downpour_param
if opt_info["stat_var_names"]:
for i in opt_info["stat_var_names"]:
downpour.stat_var_names.extend([i])
from paddle.fluid.incubate.fleet.parameter_server import version
if version.is_transpiler(
) and "fleet_desc" not in opt_info and "program_configs" not in opt_info:
return
program_configs = opt_info["program_configs"]
print("device worker program_configs:", program_configs)
for pid in program_configs:
print("device worker", pid, program_id)
if pid == program_id:
pc = downpour.program_config.add()
pc.program_id = program_id
print("device worker pull dense:",
program_configs[program_id]["pull_dense"])
for i in program_configs[program_id]["push_sparse"]:
pc.push_sparse_table_id.extend([i])
for i in program_configs[program_id]["push_dense"]:
pc.push_dense_table_id.extend([i])
dense_table_set.add(i)
for i in program_configs[program_id]["pull_sparse"]:
pc.pull_sparse_table_id.extend([i])
for i in program_configs[program_id]["pull_dense"]:
pc.pull_dense_table_id.extend([i])
dense_table_set.add(i)
break
pull_thread = trainer_desc.pull_dense_param
pull_thread.device_num = trainer_desc.thread_num
if opt_info.get("program_id_to_worker") is None and opt_info.get(
"dense_table_config") is None:
raise ValueError(
"opt_info must have program_id_to_worker or dense_table_config")
if opt_info.get("program_id_to_worker") is not None:
prog_id_to_worker = opt_info["program_id_to_worker"]
if prog_id_to_worker.get(program_id) is None:
raise ValueError("%s not found in program_id_to_worker" %
program_id)
worker = opt_info["program_id_to_worker"][program_id]
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.table_id = \
i.table_id
sparse_len = len(worker.get_desc().sparse_table)
for i in range(sparse_len):
sparse_table = downpour.sparse_table.add()
sparse_table.table_id = worker.get_desc().sparse_table[
i].table_id
sparse_table.sparse_key_name.extend(worker.get_desc()
.sparse_table[i].slot_key)
sparse_table.sparse_value_name.extend(worker.get_desc(
).sparse_table[i].slot_value)
sparse_table.sparse_grad_name.extend(worker.get_desc(
).sparse_table[i].slot_gradient)
sparse_table.fea_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
i].accessor.fea_dim
# not use emb_dim
sparse_table.emb_dim = -1
# not use hard code click
sparse_table.label_var_name = ""
for i in worker.get_desc().dense_table:
if i.table_id in dense_table_set:
dense_table = downpour.dense_table.add()
dense_table.table_id = i.table_id
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.dense_grad_name.extend(
i.dense_gradient_variable_name)
downpour.skip_ops.extend(worker.get_desc().skip_op)
else:
dense_table_config = opt_info.get("dense_table_config")
print("device worker dense_table_config:", dense_table_config)
for table_id, varnames in dense_table_config.items():
dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(varnames)
dense_table.table_id = table_id
if self._infer:
downpour.skip_ops.extend(
["push_sparse", "push_sparse_v2", "push_dense"])
class DownpourSGD(DeviceWorker): class DownpourSGD(DeviceWorker):
""" """
DownpourSGD is a kind of distributed SGD algorithm. DownpourSGD is a kind of distributed SGD algorithm.
......
...@@ -57,8 +57,8 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -57,8 +57,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch() self.ps_launch()
file1 = '/ps_log/async_run_minimize_debug:_0_worker_main.prototxt' file1 = './ps_log/async_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/async_run_minimize_debug:_1_worker_main.prototxt' file2 = './ps_log/async_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2): if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_async passed!') logger.info('test_ps_optimizer_minimize_cpu_async passed!')
else: else:
...@@ -79,8 +79,8 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -79,8 +79,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch() self.ps_launch()
''' '''
file1 = '/ps_log/sync_run_minimize_debug:_0_worker_main.prototxt' file1 = './ps_log/sync_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/sync_run_minimize_debug:_1_worker_main.prototxt' file2 = './ps_log/sync_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2): if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_sync passed!') logger.info('test_ps_optimizer_minimize_cpu_sync passed!')
else: else:
...@@ -102,8 +102,8 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -102,8 +102,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch() self.ps_launch()
file1 = '/ps_log/geo_run_minimize_debug:_0_worker_main.prototxt' file1 = './ps_log/geo_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/geo_run_minimize_debug:_1_worker_main.prototxt' file2 = './ps_log/geo_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2): if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_geo passed!') logger.info('test_ps_optimizer_minimize_cpu_geo passed!')
else: else:
...@@ -130,10 +130,10 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -130,10 +130,10 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch('heter-ps') self.ps_launch('heter-ps')
''' '''
file1 = '/ps_log/heter_run_minimize_debug:_0_worker_main.prototxt' file1 = './ps_log/heter_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/heter_run_minimize_debug:_1_worker_main.prototxt' file2 = './ps_log/heter_run_minimize_debug:_1_worker_main.prototxt'
file3 = '/ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt' file3 = './ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt'
file4 = '/ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt' file4 = './ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt'
if self.check(file1, file2) and self.check(file3, file4): if self.check(file1, file2) and self.check(file3, file4):
logger.info('test_ps_optimizer_minimize_heter passed!') logger.info('test_ps_optimizer_minimize_heter passed!')
else: else:
...@@ -155,8 +155,8 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -155,8 +155,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch("gpu-ps") self.ps_launch("gpu-ps")
file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt' file1 = './ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt' file2 = './ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2): if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_gpu passed!') logger.info('test_ps_optimizer_minimize_gpu passed!')
else: else:
...@@ -180,8 +180,8 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -180,8 +180,8 @@ class TestPsTrainerPass(PsPassTestBase):
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch("cpu-ps") self.ps_launch("cpu-ps")
file1 = '/ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt' file1 = './ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt'
file2 = '/ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt' file2 = './ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt'
if self.check(file1, file2): if self.check(file1, file2):
logger.info('test_append_send_ops_pass passed!') logger.info('test_append_send_ops_pass passed!')
else: else:
...@@ -192,5 +192,5 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -192,5 +192,5 @@ class TestPsTrainerPass(PsPassTestBase):
if __name__ == '__main__': if __name__ == '__main__':
remove_path_if_exists('/ps_log') remove_path_if_exists('./ps_log')
unittest.main() unittest.main()
...@@ -50,11 +50,11 @@ class MKLDNNBF16ActivationOp(object): ...@@ -50,11 +50,11 @@ class MKLDNNBF16ActivationOp(object):
self.dtype = np.uint16 self.dtype = np.uint16
self.init_data() self.init_data()
self.config() self.config()
self.set_attrs()
self.out = self.op_forward(self.x) self.out = self.op_forward(self.x)
self.inputs = {'X': convert_float_to_uint16(self.x)} self.inputs = {'X': convert_float_to_uint16(self.x)}
self.outputs = {'Out': self.out} self.outputs = {'Out': self.out}
self.set_attrs()
def calculate_grads(self): def calculate_grads(self):
self.dx = self.op_grad(self.out, self.x) self.dx = self.op_grad(self.out, self.x)
...@@ -162,5 +162,110 @@ class TestMKLDNNMishBF16Op(MKLDNNBF16ActivationOp, TestActivation): ...@@ -162,5 +162,110 @@ class TestMKLDNNMishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
return dout * ((np.exp(x) * omega) / delta**2) return dout * ((np.exp(x) * omega) / delta**2)
class TestMKLDNNRelu6BF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "relu6"
def op_forward(self, x):
return np.clip(x, 0, 6)
def op_grad(self, dout, x):
return np.where((x > 0) & (x <= 6), dout, 0)
class TestMKLDNNLeakyReluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "leaky_relu"
def op_forward(self, x):
return np.where(x > 0, x, self.alpha * x)
def op_grad(self, dout, x):
return np.where(x > 0, dout, self.alpha * dout)
def set_attrs(self):
self.alpha = 0.2
self.attrs = {"use_mkldnn": True, "alpha": self.alpha}
class TestMKLDNNSwishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "swish"
def expit(self, val):
return 1 / (1 + np.exp(-self.beta * val))
def op_forward(self, x):
return x * self.expit(x)
def op_grad(self, dout, x):
return dout * self.expit(x) * (1 + self.beta * x * (1 - self.expit(x)))
def set_attrs(self):
self.beta = 0.2
self.attrs = {"use_mkldnn": True, "beta": self.beta}
class TestMKLDNNHardSwishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "hard_swish"
def op_forward(self, x):
result = np.where(x < -3, 0, x)
return np.where(result > 3, result, result * (result + 3) / 6)
def op_grad(self, dout, x):
        return np.where(x < -3, 0,
                        np.where(x > 3, dout, dout * (2 * x + 3) / 6))
class TestMKLDNNTanhBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "tanh"
def op_forward(self, x):
return np.tanh(x)
def op_grad(self, dout, x):
return dout * (1 - np.tanh(x)**2)
class TestMKLDNNAbsBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "abs"
def op_forward(self, x):
return np.absolute(x)
def op_grad(self, dout, x):
return dout * np.sign(x)
class TestMKLDNNEluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "elu"
def op_forward(self, x):
return np.where(x > 0, x, self.alpha * (np.exp(x) - 1))
def op_grad(self, dout, x):
return np.where(x > 0, dout, dout * self.alpha * np.exp(x))
def set_attrs(self):
self.alpha = 0.2
self.attrs = {"use_mkldnn": True, "alpha": self.alpha}
class TestMKLDNNExpBF16Op(MKLDNNBF16ActivationOp, TestActivation):
def config(self):
self.op_type = "exp"
def op_forward(self, x):
return np.exp(x)
def op_grad(self, dout, x):
return dout * np.exp(x)
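All of the BF16 activation reference gradients above are closed-form; for the smooth ones they can be sanity-checked against a central finite difference in plain NumPy. Below is a minimal standalone sketch for the swish case, not part of the test suite; the beta value and the tolerance are assumptions.

import numpy as np

def swish(x, beta=0.2):
    # swish(x) = x * sigmoid(beta * x)
    return x / (1 + np.exp(-beta * x))

def swish_grad(x, beta=0.2):
    # analytic gradient: sigmoid(beta*x) * (1 + beta*x*(1 - sigmoid(beta*x)))
    s = 1 / (1 + np.exp(-beta * x))
    return s * (1 + beta * x * (1 - s))

x = np.linspace(-4.0, 4.0, 101)
eps = 1e-5
numeric = (swish(x + eps) - swish(x - eps)) / (2 * eps)
assert np.allclose(numeric, swish_grad(x), atol=1e-6)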
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -30,23 +30,32 @@ def ref_softplus(x, beta, threshold): ...@@ -30,23 +30,32 @@ def ref_softplus(x, beta, threshold):
return out return out
@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)), @OpTestTool.skip_if_not_cpu_bf16()
"GPU is not supported")
class TestSoftplusOneDNNOp(OpTest): class TestSoftplusOneDNNOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softplus" self.op_type = "softplus"
self.beta = 1 self.beta = 1
self.threshold = 20 self.threshold = 20
self.config() self.config()
self.set_dtype()
self.attrs = {'use_mkldnn': True, 'beta': self.beta} self.attrs = {'use_mkldnn': True, 'beta': self.beta}
self.inputs = {'X': np.random.random(self.x_shape).astype(np.float32)} self.x = np.random.random(self.x_shape)
self.out = ref_softplus(self.x, self.beta, self.threshold)
if self.dtype != np.float32:
self.x = convert_float_to_uint16(self.x)
        self.inputs = {'X': self.x}
self.outputs = { self.outputs = {
            'Out': ref_softplus(self.inputs['X'], self.beta, self.threshold) 'Out': self.out
} }
def config(self): def config(self):
self.x_shape = (10, 10) self.x_shape = (10, 10)
def set_dtype(self):
self.dtype = np.float32
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
...@@ -73,6 +82,27 @@ class TestSoftplus3DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp): ...@@ -73,6 +82,27 @@ class TestSoftplus3DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp):
self.beta = 0.4 self.beta = 0.4
class TestSoftplusBF16OneDNNOp(TestSoftplusOneDNNOp):
def set_dtype(self):
self.dtype = np.uint16
class TestSoftplus4DBF16OneDNNOp(TestSoftplus4DOneDNNOp):
def set_dtype(self):
self.dtype = np.uint16
class TestSoftplus6DBF16OneDNNOp(TestSoftplus6DOneDNNOp):
def set_dtype(self):
self.dtype = np.uint16
class TestSoftplus3DExtendedFunctorBF16OneDNNOp(
TestSoftplus3DExtendedFunctorOneDNNOp):
def set_dtype(self):
self.dtype = np.uint16
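The body of ref_softplus is truncated in this diff, so the BF16 subclasses above are easiest to read against the standard thresholded softplus formula. The sketch below is an assumption of what that reference most likely computes, not the file's actual implementation.

import numpy as np

def softplus_reference(x, beta=1.0, threshold=20.0):
    # Numerically stable softplus: once beta * x exceeds the threshold the
    # result is effectively linear, so return x directly on that branch.
    t = beta * x
    return np.where(t > threshold, x,
                    np.log1p(np.exp(np.minimum(t, threshold))) / beta)

print(softplus_reference(np.array([-2.0, 0.0, 3.0, 50.0])))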
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static() paddle.enable_static()
unittest.main() unittest.main()
...@@ -26,7 +26,7 @@ import paddle ...@@ -26,7 +26,7 @@ import paddle
from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import * from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import *
from paddle.distributed.ps.utils.public import logger, ps_log_root_dir from paddle.distributed.ps.utils.public import logger, ps_log_root_dir
from ps_dnn_trainer import DnnTrainer from ps_dnn_trainer import DnnTrainer
from paddle.distributed.fleet.proto import ps_pb2 import paddle.distributed.fleet.proto.the_one_ps_pb2 as ps_pb2
from google.protobuf import text_format from google.protobuf import text_format
......
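The import switch to the generated the_one_ps_pb2 module sits next to an unchanged google.protobuf.text_format import, which suggests the test parses descriptions from their text dumps. A generic sketch of that pattern, assuming the module exposes a top-level PSParameter message as the old ps_pb2 did; the file name is hypothetical.

from google.protobuf import text_format
import paddle.distributed.fleet.proto.the_one_ps_pb2 as ps_pb2

# Parse a text-format dump back into a message, then re-serialize it.
msg = ps_pb2.PSParameter()
with open("worker_ps_desc.prototxt") as f:  # hypothetical file name
    text_format.Parse(f.read(), msg)
print(text_format.MessageToString(msg))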
...@@ -12,9 +12,10 @@ ...@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest import unittest
import paddle import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
import time import time
......
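This and the following test files move import os to the top and set WITH_DISTRIBUTE before any paddle import, presumably because the switch is consulted at import time and has no effect if set afterwards. A minimal illustration of the required ordering; the flag name comes from the diff, its exact effect inside paddle is not shown here.

import os
os.environ["WITH_DISTRIBUTE"] = "ON"  # must be set before importing paddle

import paddle  # code that reads the flag at import time now sees "ON"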
...@@ -11,10 +11,10 @@ ...@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest import unittest
import paddle import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
import time import time
......
...@@ -11,10 +11,10 @@ ...@@ -11,10 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest import unittest
import paddle import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
import time import time
......
...@@ -309,7 +309,7 @@ class TestFleetBase(unittest.TestCase): ...@@ -309,7 +309,7 @@ class TestFleetBase(unittest.TestCase):
(tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log)) (tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log))
def _run_cluster(self, model, envs): def _run_cluster(self, model, envs):
env = {'GRAD_CLIP': str(self._grad_clip_mode)} env = {'GRAD_CLIP': str(self._grad_clip_mode), 'WITH_DISTRIBUTE': 'ON'}
python_path = self._python_interp python_path = self._python_interp
gloo_path = tempfile.mkdtemp() gloo_path = tempfile.mkdtemp()
...@@ -343,7 +343,8 @@ class TestFleetBase(unittest.TestCase): ...@@ -343,7 +343,8 @@ class TestFleetBase(unittest.TestCase):
tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log = tr1 tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log = tr1
# Wait until trainer process terminate # Wait until trainer process terminate
time_out = 120 #time_out = 120
time_out = 60
cur_time = 0 cur_time = 0
while True: while True:
......
...@@ -51,8 +51,9 @@ class TestDistMnistAsyncInMemoryDataset2x2(TestFleetBase): ...@@ -51,8 +51,9 @@ class TestDistMnistAsyncInMemoryDataset2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self): def test_dist_train(self):
self.check_with_place( # self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False) # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
class TestDistMnistAsync2x2(TestFleetBase): class TestDistMnistAsync2x2(TestFleetBase):
...@@ -85,8 +86,9 @@ class TestDistMnistAsync2x2(TestFleetBase): ...@@ -85,8 +86,9 @@ class TestDistMnistAsync2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self): def test_dist_train(self):
self.check_with_place( # self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False) # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
class TestDistCtrHalfAsync2x2(TestFleetBase): class TestDistCtrHalfAsync2x2(TestFleetBase):
...@@ -122,8 +124,9 @@ class TestDistCtrHalfAsync2x2(TestFleetBase): ...@@ -122,8 +124,9 @@ class TestDistCtrHalfAsync2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self): def test_dist_train(self):
self.check_with_place( # self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False) # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -52,8 +52,9 @@ class TestDistMnistSync2x2(TestFleetBase): ...@@ -52,8 +52,9 @@ class TestDistMnistSync2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self): def test_dist_train(self):
self.check_with_place( # self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False) # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
# @unittest.skip(reason="Skip unstable ut, reader need to be rewrite") # @unittest.skip(reason="Skip unstable ut, reader need to be rewrite")
...@@ -91,8 +92,9 @@ class TestDistMnistAsyncDataset2x2(TestFleetBase): ...@@ -91,8 +92,9 @@ class TestDistMnistAsyncDataset2x2(TestFleetBase):
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self): def test_dist_train(self):
self.check_with_place( # self.check_with_place(
"dist_fleet_ctr.py", delta=1e-5, check_error_log=False) # "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
print('recover later')
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from __future__ import print_function from __future__ import print_function
import os import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest import unittest
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
......
...@@ -13,14 +13,14 @@ ...@@ -13,14 +13,14 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
import unittest import unittest
import paddle import paddle
import os
paddle.enable_static() paddle.enable_static()
# For Net # For Net
...@@ -74,11 +74,12 @@ class TestExponentialDecay(unittest.TestCase): ...@@ -74,11 +74,12 @@ class TestExponentialDecay(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss) optimizer.minimize([loss])
fleet.init_server() fleet.init_server()
if __name__ == '__main__': if __name__ == '__main__':
os.environ["GLOG_v"] = "4" os.environ["GLOG_v"] = "4"
os.environ["GLOG_logtostderr"] = "1" os.environ["GLOG_logtostderr"] = "1"
unittest.main() unittest.main()
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
from __future__ import print_function from __future__ import print_function
import os import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest import unittest
import tempfile import tempfile
import shutil import shutil
......
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
from __future__ import print_function from __future__ import print_function
import os import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest import unittest
import tempfile import tempfile
import shutil import shutil
......
...@@ -13,10 +13,12 @@ ...@@ -13,10 +13,12 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid as fluid import paddle.fluid as fluid
import os
import unittest import unittest
import paddle import paddle
paddle.enable_static() paddle.enable_static()
......
...@@ -13,10 +13,11 @@ ...@@ -13,10 +13,11 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid as fluid import paddle.fluid as fluid
import os
import unittest import unittest
import paddle import paddle
paddle.enable_static() paddle.enable_static()
......
...@@ -13,10 +13,11 @@ ...@@ -13,10 +13,11 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid as fluid import paddle.fluid as fluid
import os
import unittest import unittest
import paddle import paddle
paddle.enable_static() paddle.enable_static()
......
...@@ -23,7 +23,7 @@ local_logger = get_logger( ...@@ -23,7 +23,7 @@ local_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer, PSGPUTrainer, HeterPipelineTrainer from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer, PSGPUTrainer, HeterPipelineTrainer
from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT, HeterSection from .device_worker import Hogwild, DownpourSGD, DownpourLite, Section, DownpourSGDOPT, HeterSection
from .framework import Variable from .framework import Variable
from multiprocessing import Process, Manager from multiprocessing import Process, Manager
......
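Adding DownpourLite to this import list makes the class visible to the factory that instantiates device workers by name. A factory of that shape typically resolves the class from the module namespace; the sketch below is a hypothetical illustration of the pattern, not the actual trainer_factory code.

import sys

def create_device_worker(class_name):
    # Hypothetical name-to-class resolution; shows why DownpourLite must be
    # importable from the factory module. Not the real trainer_factory.
    module = sys.modules[__name__]
    worker_cls = getattr(module, class_name, None)
    if worker_cls is None:
        raise ValueError("unknown device worker: %s" % class_name)
    return worker_cls()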