“d649dbf442bd7ba4ce63a2a4e479a27c8d40ca8d”上不存在“paddle/fluid/operators/sequence_ops/sequence_softmax_op.h”
提交 862dde5e 编写于 作者: Z zlsh80826

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into trt_stack_op

......@@ -118,7 +118,7 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op")
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......
......@@ -27,6 +27,7 @@ add_subdirectory(fleet)
add_subdirectory(io)
#ddim lib
proto_library(framework_proto SRCS framework.proto)
proto_library(heter_service_proto SRCS heter_service.proto)
proto_library(data_feed_proto SRCS data_feed.proto)
proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
data_feed_proto)
......@@ -195,20 +196,37 @@ cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc o
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
heterxpu_trainer.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper box_wrapper lodtensor_printer
device_context scope framework_proto trainer_desc_proto glog fs shell
fleet_wrapper heter_wrapper box_wrapper lodtensor_printer
lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
graph_to_program_pass variable_helper data_feed_proto timer monitor)
graph_to_program_pass variable_helper data_feed_proto timer monitor
heter_service_proto)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
elseif(WITH_PSLIB)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
heterxpu_trainer.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor pslib_brpc )
# TODO: Fix these unittest failed on Windows
if(NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
else()
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
heterxpu_trainer.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto data_feed_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper box_wrapper lodtensor_printer feed_fetch_method
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor)
# TODO: Fix these unittest failed on Windows
if(NOT WIN32)
......
......@@ -27,6 +27,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -51,10 +52,23 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
class FleetWrapper;
#ifdef PADDLE_WITH_PSLIB
class HeterWrapper;
#endif
class PullDenseWorker {
public:
virtual ~PullDenseWorker() {}
virtual void Initialize(const TrainerDesc& param);
#ifdef PADDLE_WITH_CUDA
void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
void AddPlace(const paddle::platform::Place place) {
places_.push_back(place);
}
void AddThreadScope(Scope* scope) { thread_scopes_.push_back(scope); }
#endif
int Start();
void Stop();
void SetRootScope(Scope* scope) { root_scope_ = scope; }
......@@ -62,6 +76,7 @@ class PullDenseWorker {
void ResetThreadVersion(uint64_t table_id);
void Wait(std::vector<::std::future<int32_t>>* status_vec);
void PullDense(bool force_update = false);
void CreatePinVar();
int GetThreadIdByScope(const Scope* scope);
void SetThreadIdByScope(const Scope* scope, int tid);
static std::shared_ptr<PullDenseWorker> GetInstance() {
......@@ -105,6 +120,12 @@ class PullDenseWorker {
std::mutex mutex_for_mean_scale_;
float total_batch_num_ = 0;
std::unordered_map<const Scope*, int> scope_to_thread_id_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
std::vector<paddle::platform::Place> places_;
std::vector<Scope*> thread_scopes_;
#endif
};
// should incorporate different type of device
......@@ -126,6 +147,8 @@ class DeviceWorker {
virtual void BindingDataFeedMemory() = 0;
virtual void SetRootScope(Scope* root_scope);
virtual void SetDataFeed(DataFeed* data_feed);
virtual void SetWorkerNum(int num) {}
virtual void CacheProgram(const ProgramDesc& main_program) {}
virtual void SetNeedDumpField(bool need_dump_field) {
need_dump_field_ = need_dump_field;
}
......@@ -161,6 +184,7 @@ class DeviceWorker {
FetchConfig fetch_config_;
bool use_cvm_;
bool no_cvm_;
TrainerDesc trainer_desc_;
// dump params or grads for debug
bool need_dump_param_;
......@@ -306,6 +330,87 @@ class DownpourWorkerOpt : public DownpourWorker {
uint64_t async_tid_ = 0;
};
#ifdef PADDLE_WITH_PSLIB
class HeterCpuWorker : public HogwildWorker {
public:
HeterCpuWorker() {}
virtual ~HeterCpuWorker() {}
virtual void Initialize(const TrainerDesc& desc);
virtual void TrainFiles();
virtual void TrainFilesWithProfiler();
virtual void SetNeedDump(bool need_dump_field);
virtual void SetChannelWriter(ChannelObject<std::string>* queue);
virtual void SetWorkerNum(int num) { worker_num_ = num; }
virtual void Schedule(int taskid);
virtual void JumpContext(std::shared_ptr<HeterTask> task);
virtual void CacheProgram(const ProgramDesc& main_program) {
new (&program_) ProgramDesc(main_program);
}
virtual void GetXpuOpIndex();
protected:
std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
void FillSparseValue(std::shared_ptr<HeterTask> task, size_t table_id);
void PushGradients();
void CollectLabelInfo(std::shared_ptr<HeterTask> task, size_t table_id);
void AdjustInsWeight(std::shared_ptr<HeterTask> task);
void DumpParam();
void CopySparseTable();
void CopyDenseTable();
void CopyDenseVars();
private:
int mpi_rank_;
int worker_num_;
int xpu_begin_op_index_;
int xpu_end_op_index_;
ProgramDesc program_;
HeterObjectPool<HeterTask> object_pool_;
HeterList<int, std::shared_ptr<HeterTask>> run_queue_;
HeterList<int, std::shared_ptr<HeterTask>> wait_queue_;
bool need_dump_param_;
std::vector<std::string> dump_param_;
bool need_to_push_dense_;
bool need_dump_field_;
bool dump_slot_;
bool need_to_push_sparse_;
std::vector<std::string> dump_fields_;
ChannelWriter<std::string> writer_;
DownpourWorkerParameter param_;
float scale_datanorm_;
// just save the value in param_ for easy access
std::map<uint64_t, std::string> label_var_name_;
std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
std::map<uint64_t, std::vector<std::string>> dense_value_names_;
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
platform::Place root_place_;
// actually pushed feasign of each table
std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
// skipped ops
std::vector<std::string> skip_ops_;
std::vector<::std::future<int32_t>> push_sparse_status_;
std::vector<::std::future<int32_t>> push_dense_status_;
// adjust ins weight
AdjustInsWeightConfig adjust_ins_weight_config_;
std::vector<float> nid_show_;
// check nan and inf during training
std::vector<std::string> check_nan_var_names_;
// copy table
CopyTableConfig copy_table_config_;
std::map<uint64_t, uint64_t> table_dependency_;
std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
};
#endif
#if defined(PADDLE_WITH_NCCL)
class SectionWorker : public DeviceWorker {
public:
......
......@@ -62,6 +62,9 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
#ifdef PADDLE_WITH_PSLIB
REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
#endif
#if defined(PADDLE_WITH_NCCL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
......
......@@ -35,7 +35,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
dump_file_num_ = trainer_desc.dump_file_num();
const std::vector<paddle::framework::DataFeed *> readers =
dataset->GetReaders();
RegisterHeterCallback();
thread_num_ = readers.size();
workers_.resize(thread_num_);
for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
......@@ -55,6 +55,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
workers_[i]->SetDumpParamVector(dump_param_);
workers_[i]->InitRandomDumpConfig(trainer_desc);
workers_[i]->Initialize(trainer_desc);
workers_[i]->SetWorkerNum(thread_num_);
}
VLOG(3) << "going to initialize pull dense worker";
......@@ -64,6 +65,13 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
SetDebug(trainer_desc.debug());
}
// Registers the heter-task callback on the FleetWrapper instance returned by
// FleetWrapper::GetInstance(). The callback currently does nothing: the
// dispatch to the matching worker's Schedule() is disabled.
void DistMultiTrainer::RegisterHeterCallback() {
  auto on_heter_task = [this](int worker, int taskid) {
    // workers_[worker]->Schedule(taskid);
  };
  FleetWrapper::GetInstance()->RegisterHeterCallback(on_heter_task);
}
void DistMultiTrainer::InitDumpEnv() {
queue_ = paddle::framework::MakeChannel<std::string>();
for (int i = 0; i < thread_num_; ++i) {
......@@ -90,6 +98,9 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
workers_[i]->SetRootScope(root_scope_);
workers_[i]->CreateDeviceResource(main_program); // Program
workers_[i]->BindingDataFeedMemory();
#ifdef PADDLE_WITH_PSLIB
workers_[i]->CacheProgram(main_program);
#endif
}
// Scope* -> thread id, it will be used in push_dense op
for (int i = 0; i < thread_num_; ++i) {
......@@ -104,6 +115,11 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
}
pull_dense_worker_->SetRootScope(root_scope_);
pull_dense_worker_->Start();
#ifdef PADDLE_WITH_PSLIB
for (int i = 0; i < thread_num_; ++i) {
workers_[i]->GetXpuOpIndex();
}
#endif
VLOG(3) << "init other env done.";
}
......
......@@ -379,7 +379,7 @@ void DownpourWorker::CopyDenseTable() {
pull_dense_status.resize(0);
fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
dense_value_names_[dest_table],
&pull_dense_status);
&pull_dense_status, true);
for (auto& t : pull_dense_status) {
t.wait();
auto status = t.get();
......
......@@ -19,4 +19,6 @@ else()
cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
endif(WITH_GLOO)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto)
cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
......@@ -154,6 +154,219 @@ void FleetWrapper::CreateClient2ClientConnection() {
#endif
}
#ifdef PADDLE_WITH_PSLIB
// Collects the non-zero feature ids of every slot in var_names that has a
// matching embedding variable in the task scope, then pulls their values from
// the parameter server through heter_pull_sparse and blocks until the pull
// has finished.
//
// workerid      - forwarded to heter_pull_sparse.
// task          - supplies the scope and receives the collected keys
//                 (task->features_[table_id]) and pulled values
//                 (task->feature_values_[table_id]).
// table_id      - sparse table to pull from.
// var_names     - names of the id (key) variables, one per slot.
// fea_value_dim - length each pulled value vector is resized to.
// var_emb_names - names of the embedding variables, parallel to var_names.
//
// On a non-zero pull status the process logs the error, sleeps for
// sleep_seconds_before_fail_exit_ seconds and exits with -1.
void FleetWrapper::HeterPullSparseVars(
    int workerid, std::shared_ptr<HeterTask> task, const uint64_t table_id,
    const std::vector<std::string>& var_names, int fea_value_dim,
    const std::vector<std::string>& var_emb_names) {
  std::vector<::std::future<int32_t>> pull_sparse_status;
  auto& scope = *(task->scope_);
  auto& fea_keys = (task->features_)[table_id];
  auto& fea_values = (task->feature_values_)[table_id];
  fea_keys.clear();
  for (size_t var_index = 0; var_index < var_names.size(); ++var_index) {
    const std::string& name = var_names[var_index];
    Variable* var = scope.FindVar(name);
    if (var == nullptr) {
      continue;
    }
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
    CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
    int64_t* ids = tensor->data<int64_t>();
    size_t len = tensor->numel();
    // skip slots which do not have embedding; a slot with no entry at all in
    // var_emb_names is treated the same way instead of reading past the end
    // of the vector
    if (var_index >= var_emb_names.size()) {
      continue;
    }
    const std::string& emb_name = var_emb_names[var_index];
    Variable* emb_var = scope.FindVar(emb_name);
    if (emb_var == nullptr) {
      continue;
    }
    for (auto i = 0u; i < len; ++i) {
      // id 0 is not pulled
      if (ids[i] == 0u) {
        continue;
      }
      fea_keys.push_back(static_cast<uint64_t>(ids[i]));
    }
  }
  // one more value slot than there are keys is allocated
  fea_values.resize(fea_keys.size() + 1);
  for (auto& t : fea_values) {
    t.resize(fea_value_dim);
  }
  std::vector<float*> pull_result_ptr;
  for (auto& t : fea_values) {
    pull_result_ptr.push_back(t.data());
  }
  auto status = pslib_ptr_->_worker_ptr->heter_pull_sparse(
      workerid, pull_result_ptr.data(), table_id, fea_keys.data(),
      fea_keys.size(), task->taskid_);
  pull_sparse_status.push_back(std::move(status));
  for (auto& t : pull_sparse_status) {
    t.wait();
    auto status = t.get();
    if (status != 0) {
      LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
      sleep(sleep_seconds_before_fail_exit_);
      exit(-1);
    }
  }
}
void FleetWrapper::HeterPushSparseVars(
std::shared_ptr<HeterTask> task, const uint64_t table_id,
const std::vector<std::string>& sparse_key_names,
const std::vector<std::string>& sparse_grad_names, const int emb_dim,
std::vector<::std::future<int32_t>>* push_sparse_status, const bool use_cvm,
const bool dump_slot, const bool no_cvm) {
auto& scope = *(task->scope_);
int batch_size = task->cur_batch_;
int offset = 2;
int slot_offset = 0;
int grad_dim = emb_dim;
int show_index = 0;
int click_index = 1;
auto& fea_keys = (task->features_)[table_id];
auto& fea_labels = (task->feature_labels_)[table_id];
auto& push_values = (task->feature_grads_)[table_id];
auto& sparse_push_keys = (task->sparse_push_keys_)[table_id];
if (use_cvm) {
offset = 0;
grad_dim = emb_dim - 2;
}
if (no_cvm) {
offset = 0;
grad_dim = emb_dim;
}
if (dump_slot) {
slot_offset = 1;
show_index = 1;
click_index = 2;
}
CHECK_GE(grad_dim, 0);
sparse_push_keys.clear();
sparse_push_keys.reserve(fea_keys.size() + 1);
push_values.resize(fea_keys.size() + 1);
for (auto& t : push_values) {
t.resize(emb_dim + offset + slot_offset);
}
uint64_t fea_idx = 0u;
for (size_t i = 0;
i < sparse_key_names.size() && i < sparse_grad_names.size(); ++i) {
Variable* var = scope.FindVar(sparse_key_names[i]);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (tensor == nullptr) {
LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
exit(-1);
}
size_t len = tensor->numel();
int64_t* ids = tensor->data<int64_t>();
int slot = 0;
if (dump_slot) {
slot = boost::lexical_cast<int>(sparse_key_names[i]);
}
Variable* g_var = scope.FindVar(sparse_grad_names[i]);
if (g_var == nullptr) {
continue;
}
LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
if (g_tensor == nullptr) {
LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
exit(-1);
}
float* g = g_tensor->data<float>();
if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
int dim = emb_dim + offset;
Eigen::Map<
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
g_mat(g, g_tensor->numel() / dim, dim);
g_mat.rightCols(grad_dim) *= batch_size;
}
for (auto id_idx = 0u; id_idx < len; ++id_idx) {
if (ids[id_idx] == 0) {
g += emb_dim;
continue;
}
sparse_push_keys.push_back(ids[id_idx]);
CHECK(fea_idx < push_values.size());
if (use_cvm || no_cvm) {
memcpy(push_values[fea_idx].data() + offset + slot_offset, g,
sizeof(float) * emb_dim);
} else {
CHECK(fea_idx < fea_labels.size());
memcpy(push_values[fea_idx].data() + offset + slot_offset, g,
sizeof(float) * emb_dim);
push_values[fea_idx][show_index] = 1.0f;
push_values[fea_idx][click_index] =
static_cast<float>(fea_labels[fea_idx]);
}
if (dump_slot) {
push_values[fea_idx][0] = static_cast<float>(slot);
}
g += emb_dim;
fea_idx++;
}
}
// slots whose embedding has been stop gradient or
// not involved in forward-backward
uint64_t no_grad_fea_num = 0u;
for (size_t i = sparse_grad_names.size(); i < sparse_key_names.size(); ++i) {
Variable* var = scope.FindVar(sparse_key_names[i]);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (tensor == nullptr) {
LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
exit(-1);
}
size_t len = tensor->numel();
int64_t* ids = tensor->data<int64_t>();
for (auto id_idx = 0u; id_idx < len; ++id_idx) {
if (ids[id_idx] == 0) {
continue;
}
++no_grad_fea_num;
}
}
CHECK(fea_idx + no_grad_fea_num == fea_keys.size())
<< "fea_idx: " << fea_idx << " no_grad_fea_num: " << no_grad_fea_num
<< " features size: " << fea_keys.size();
CHECK(fea_idx == sparse_push_keys.size());
if (fea_idx == 0) {
return;
}
std::vector<float*> push_g_vec;
for (auto i = 0u; i < sparse_push_keys.size(); ++i) {
push_g_vec.push_back(push_values[i].data());
}
auto status = pslib_ptr_->_worker_ptr->push_sparse(
table_id, sparse_push_keys.data(), (const float**)push_g_vec.data(),
sparse_push_keys.size());
push_sparse_status->push_back(std::move(status));
}
#endif
// Hands the callback to the pslib worker and returns the worker's result.
// Without PADDLE_WITH_PSLIB this only logs a message and returns 0.
int FleetWrapper::RegisterHeterCallback(HeterCallBackFunc handler) {
#ifndef PADDLE_WITH_PSLIB
  VLOG(0) << "FleetWrapper::RegisterHeterCallback"
          << " does nothing when no pslib";
  return 0;
#else
  VLOG(3) << "calling FleetWrapper::RegisterHeterCallback";
  VLOG(3) << "pslib_ptr_=" << pslib_ptr_;
  VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr;
  return pslib_ptr_->_worker_ptr->registe_heter_callback(handler);
#endif
}
void FleetWrapper::PullSparseToLocal(const uint64_t table_id,
int fea_value_dim) {
#ifdef PADDLE_WITH_PSLIB
......@@ -421,13 +634,17 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
void FleetWrapper::PullDenseVarsAsync(
const Scope& scope, const uint64_t tid,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* pull_dense_status) {
std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu) {
#ifdef PADDLE_WITH_PSLIB
auto& regions = _regions[tid];
regions.clear();
regions.resize(var_names.size());
for (auto i = 0u; i < var_names.size(); ++i) {
Variable* var = scope.FindVar(var_names[i]);
std::string varname = var_names[i];
if (!in_cpu) {
varname = var_names[i] + "pin";
}
Variable* var = scope.FindVar(varname);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
paddle::ps::Region reg(w, tensor->numel());
......@@ -485,6 +702,57 @@ void FleetWrapper::PushDenseVarsSync(
Scope* scope, const uint64_t table_id,
const std::vector<std::string>& var_names) {}
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
// CUDA overload: copies each dense gradient from the device into the host
// tensor named "<var>pin", optionally rescales the data-norm statistics, and
// submits all regions to the server through push_dense.
//
// scope              - scope holding both the gradient variables and their
//                      "<var>pin" host counterparts.
// table_id           - dense table to push to.
// var_names          - names of the gradient variables.
// push_sparse_status - if non-null, receives the future of the push.
// scale_datanorm     - when >= 0, ".batch_size@GRAD" / ".batch_sum@GRAD" are
//                      divided by batch_size and ".batch_square_sum@GRAD" is
//                      rescaled using this value.
// batch_size         - divisor used by the rescaling above.
// place              - CUDA place the gradient tensors live on.
// stream, event      - stream used for the copy and event used to wait for it.
void FleetWrapper::PushDenseVarsAsync(
    const Scope& scope, const uint64_t table_id,
    const std::vector<std::string>& var_names,
    std::vector<::std::future<int32_t>>* push_sparse_status,
    float scale_datanorm, int batch_size, const paddle::platform::Place& place,
    cudaStream_t stream, cudaEvent_t event) {
  std::vector<paddle::ps::Region> regions;
  for (auto& t : var_names) {
    Variable* var = scope.FindVar(t);
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
    int count = tensor->numel();
    float* g_data = tensor->data<float>();

    Variable* pin_var = scope.FindVar(t + "pin");
    LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
    float* pin_g = pin_tensor->mutable_data<float>(tensor->dims(),
                                                   platform::CUDAPinnedPlace());
    memory::Copy(platform::CUDAPinnedPlace(), pin_g,
                 BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
                 sizeof(float) * count, stream);
    // The host buffer is read right below, so the copy must have completed.
    // Both CUDA calls are checked: a failed synchronize would otherwise let
    // the code push stale or partially copied data.
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventSynchronize(event));

    float* g = pin_g;
    if (scale_datanorm >= 0) {
      if (t.find(".batch_size@GRAD") != std::string::npos ||
          t.find(".batch_sum@GRAD") != std::string::npos) {
        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
        float scale = 1.0 / batch_size;
        mat *= scale;
      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
        VLOG(3) << "epsilon: " << scale_datanorm;
        for (int i = 0; i < count; ++i) {
          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
                 batch_size * scale_datanorm;
        }
      }
    }
    paddle::ps::Region reg(g, count);
    regions.emplace_back(std::move(reg));
  }

  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
                                                    regions.size(), table_id);
  if (push_sparse_status) {
    push_sparse_status->push_back(std::move(status));
  }
}
#endif
void FleetWrapper::PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
......@@ -1085,8 +1353,8 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
push_status.wait();
auto status = push_status.get();
if (status != 0) {
PADDLE_THORW(platform::errors::Fatal(
"push shrink dense param failed, status is [%d].", status));
// PADDLE_THORW(platform::errors::Fatal(
// "push shrink dense param failed, status is [%d].", status));
sleep(sleep_seconds_before_fail_exit_);
exit(-1);
}
......
......@@ -28,6 +28,7 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -80,6 +81,24 @@ class FleetWrapper {
pull_local_thread_num_ = thread_num;
}
#ifdef PADDLE_WITH_PSLIB
void HeterPullSparseVars(int workerid, std::shared_ptr<HeterTask> task,
const uint64_t table_id,
const std::vector<std::string>& var_names,
int fea_dim,
const std::vector<std::string>& var_emb_names);
void HeterPushSparseVars(
std::shared_ptr<HeterTask> task, const uint64_t table_id,
const std::vector<std::string>& sparse_key_names,
const std::vector<std::string>& sparse_grad_names, const int emb_dim,
std::vector<::std::future<int32_t>>* push_sparse_status,
const bool use_cvm, const bool dump_slot, const bool no_cvm);
#endif
typedef std::function<void(int, int)> HeterCallBackFunc;
int RegisterHeterCallback(HeterCallBackFunc handler);
// Pull sparse variables from server in sync mode
// Param<in>: scope, table_id, var_names, fea_keys, fea_dim, var_emb_names
// Param<out>: fea_values
......@@ -118,15 +137,24 @@ class FleetWrapper {
void PullDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* pull_dense_status);
std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu);
// push dense parameters(not gradients) to server in sync mode
void PushDenseParamSync(const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names);
// Push dense variables to server in async mode
// Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
// Param<out>: push_sparse_status
// Push dense variables to server in async mode
// Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
// Param<out>: push_sparse_status
#ifdef PADDLE_WITH_CUDA
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
std::vector<::std::future<int32_t>>* push_sparse_status,
float scale_datanorm, int batch_size,
const paddle::platform::Place& place, cudaStream_t stream,
cudaEvent_t event);
#endif
void PushDenseVarsAsync(
const Scope& scope, const uint64_t table_id,
const std::vector<std::string>& var_names,
......
......@@ -54,10 +54,10 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
paddle::framework::fs_remove(tmp);
if (i == retry_times_) {
VLOG(0) << "fs_open_write failed, retry times reaches limit";
PADDLE_THROW(platform::errors::PreconditionNotMet(
"fs_open_write failed, retry times reaches"
" limit ",
retry_times_));
// PADDLE_THROW(platform::errors::PreconditionNotMet(
// "fs_open_write failed, retry times reaches"
// " limit ",
// retry_times_));
}
} else {
break;
......@@ -143,9 +143,9 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
break;
}
}
PADDLE_THROW(platform::errors::ExecutionTimeout(
"TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
last_check_rank));
// PADDLE_THROW(platform::errors::ExecutionTimeout(
VLOG(0) << "TIMEOUT self_rank = " << self_rank_
<< " pair_rank = " << last_check_rank;
}
std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_));
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/timer.h"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {
std::shared_ptr<HeterWrapper> HeterWrapper::s_instance_ = NULL;
bool HeterWrapper::is_initialized_ = false;
void HeterWrapper::CreateClient2XpuConnection() {
brpc::ChannelOptions options;
options.protocol = "baidu_std";
options.connection_type = "single";
options.timeout_ms = 2000000;
xpu_channels_.resize(xpu_list_.size());
for (size_t i = 0; i < xpu_list_.size(); ++i) {
VLOG(3) << "channel init: " << xpu_list_[i];
xpu_channels_[i].reset(new brpc::Channel());
if (xpu_channels_[i]->Init(xpu_list_[i].c_str(), "", &options) != 0) {
VLOG(0) << "server channel init fail";
}
}
}
// Forwards the (cmd, func) pair unchanged to service_.RegisterServiceHandler.
void HeterWrapper::RegisterServiceHandler(int cmd, HeterServiceHandler func) {
service_.RegisterServiceHandler(cmd, func);
}
// Appends every entry of xpu_list to xpu_list_ (existing entries are kept),
// logging each one together with the resulting list size.
void HeterWrapper::SetXpuList(const std::vector<std::string>& xpu_list) {
#ifdef PADDLE_WITH_PSLIB
  VLOG(3) << "Going to set xpu list";
  for (size_t i = 0; i < xpu_list.size(); ++i) {
    const std::string& endpoint = xpu_list[i];
    xpu_list_.push_back(endpoint);
    VLOG(3) << "set xpu list: " << endpoint << " size: " << xpu_list_.size();
  }
#endif
}
void HeterWrapper::StartXpuService(const std::string& ip, uint32_t port) {
std::string ip_port = ip + ":" + std::to_string(port);
VLOG(3) << "xpu server starts at " << ip_port;
server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
brpc::ServerOptions options;
if (server_.Start(ip_port.c_str(), &options) != 0) {
VLOG(0) << "xpu server start fail";
}
}
// void HeterWrapper::SerializeToReq(const std::string& varname,
// Scope* scope, HeterRequest& request) {
// auto* req_var = request.mutable_vars();
void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
VariableMessage* req_var) {
Variable* var = scope->FindVar(varname);
if (var == nullptr) {
return;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
req_var->set_varname(varname);
req_var->set_type(LOD_TENSOR);
req_var->set_data_type(static_cast<VariableMessage::Type>(tensor->type()));
for (auto& dim : framework::vectorize(tensor->dims())) {
req_var->add_dims(dim);
}
const framework::LoD lod = tensor->lod();
if (lod.size() > 0) {
req_var->set_lod_level(lod.size());
for (auto& each : lod) {
VariableMessage::LodData* lod_inner = req_var->add_lod();
for (auto& d : each) {
lod_inner->add_lod_data(d);
}
}
}
auto* req_data = req_var->mutable_data();
req_data->clear();
req_data->resize(tensor->numel() * SizeOfType(tensor->type()));
char* data_ptr = const_cast<char*>(req_data->data());
if (platform::is_cpu_place(tensor->place())) {
memcpy(data_ptr, tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()));
}
#ifdef PADDLE_WITH_CUDA
else {
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
tensor->data<void>(),
tensor->numel() * SizeOfType(tensor->type()), nullptr);
}
#endif
}
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
// Rebuilds the LoDTensor named req_var.varname() inside scope from the
// serialized message: dims, LoD and raw bytes.
//
// scope   - scope that must already contain the target variable; when it
//           does not, the message is skipped with a log line.
// req_var - serialized variable.
// place   - where the tensor memory is allocated.
// stream  - (CUDA builds only) stream used for the host-to-device copy.
//
// In CUDA builds the bytes are copied with memory::Copy only when place is
// not a CPU place; a CPU place (as passed by EndPass and CallRemoteXpu) is
// filled with a plain memcpy.
#ifdef PADDLE_WITH_CUDA
void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                       const VariableMessage& req_var,
                                       platform::Place place,
                                       cudaStream_t stream) {
#else
void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                       const VariableMessage& req_var,
                                       platform::Place place) {
#endif
  auto* var = scope->FindVar(req_var.varname());
  if (var == nullptr) {
    // mirror SerializeToReq, which also ignores variables missing from scope
    VLOG(0) << "DeSerializeToTensor: var " << req_var.varname()
            << " not found in scope, skipped";
    return;
  }
  auto* tensor = var->GetMutable<LoDTensor>();

  std::vector<int> vec_dim;
  for (auto& x : req_var.dims()) {
    vec_dim.push_back(x);
  }
  tensor->Resize(make_ddim(vec_dim));

  LoD lod;
  for (int i = 0; i < req_var.lod_level(); ++i) {
    framework::Vector<size_t> v;
    for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
      v.push_back(req_var.lod(i).lod_data(j));
    }
    lod.push_back(v);
  }
  tensor->set_lod(lod);

  void* tensor_data =
      tensor->mutable_data(place, ToVarType(req_var.data_type()));
  const auto nbytes = tensor->numel() * SizeOfType(tensor->type());

#ifdef PADDLE_WITH_CUDA
  if (platform::is_cpu_place(place)) {
    // BOOST_GET_CONST(CUDAPlace, ...) must not be applied to a CPU place
    memcpy(tensor_data, req_var.data().data(), nbytes);
  } else {
    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
                 platform::CPUPlace(), req_var.data().data(), nbytes, stream);
  }
#else
  memcpy(tensor_data, req_var.data().data(), nbytes);
#endif
}
// Maps the wire-level VariableMessage element type to the framework
// VarType. FP32, FP64, INT32, INT64 and BOOL are supported; any other value
// is logged and raises an InvalidArgument error instead of falling off the
// end of the function without a return value.
framework::proto::VarType::Type HeterWrapper::ToVarType(
    VariableMessage::Type type) {
  switch (type) {
    case VariableMessage::FP32:
      return framework::proto::VarType::FP32;  // NOLINT
    case VariableMessage::FP64:
      return framework::proto::VarType::FP64;  // NOLINT
    case VariableMessage::INT32:
      return framework::proto::VarType::INT32;  // NOLINT
    case VariableMessage::INT64:
      return framework::proto::VarType::INT64;  // NOLINT
    case VariableMessage::BOOL:
      return framework::proto::VarType::BOOL;  // NOLINT
    default:
      VLOG(0) << "Not support type " << type;
      PADDLE_THROW(platform::errors::InvalidArgument(
          "ToVarType: unsupported VariableMessage type %d.",
          static_cast<int>(type)));
  }
}
void HeterWrapper::StopXpuService(int num) {
HeterRequest request;
HeterResponse response;
brpc::Controller cntl;
request.set_cmd(2);
// for (size_t i = 0; i < xpu_channels_.size(); ++i) {
HeterService_Stub stub(xpu_channels_[num].get());
stub.service(&cntl, &request, &response, NULL);
if (cntl.Failed()) {
VLOG(0) << "call stop xpu service fail: " << cntl.ErrorText();
} else {
VLOG(3) << "call stop xpu service success";
}
// }
}
// Sends command 1 (logged as "end pass") through xpu_channels_[num] and, when
// the call succeeds, deserializes every variable of the response into `scope`
// as a CPU tensor. A failed call is only logged.
// NOTE(review): `num` is not range-checked against xpu_channels_.size().
void HeterWrapper::EndPass(Scope* scope, int num) {
  HeterRequest end_request;
  HeterResponse end_response;
  brpc::Controller controller;
  end_request.set_cmd(1);

  HeterService_Stub stub(xpu_channels_[num].get());
  stub.service(&controller, &end_request, &end_response, NULL);

  if (controller.Failed()) {
    VLOG(0) << "call end pass fail: " << controller.ErrorText();
    return;
  }
  VLOG(3) << "call end pass success";
  for (const auto& var_msg : end_response.vars()) {
    DeSerializeToTensor(scope, var_msg, platform::CPUPlace());
  }
}
void HeterWrapper::CallRemoteXpu(std::shared_ptr<HeterTask> task,
HeterCpuWorker* worker, int mpi_rank,
std::vector<std::string>& send_vars) {
HeterRequest request;
request.set_cmd(0);
request.set_cur_batch(task->cur_batch_);
OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) {
auto* closure = (OnHeterRpcDone*)done;
if (closure->cntl.Failed()) {
VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText();
} else {
VLOG(3) << "call xpu success";
}
// DeSerializeToTensor(task->scope_,
// closure->response.vars(), platform::CPUPlace());
for (int i = 0; i < closure->response.vars_size(); ++i) {
DeSerializeToTensor(task->scope_, closure->response.vars(i),
platform::CPUPlace());
}
worker->Schedule(task->taskid_);
});
// std::vector<std::string> varnames = {"click", "12345"};
// //varnames.push_back(send_var);
// //if (send_var == "_generated_var_412") {
// varnames.push_back("filter_by_instag_0.tmp_0");
// varnames.push_back("filter_by_instag_2.tmp_0");
// varnames.push_back("filter_by_instag_0.tmp_1");
// varnames.push_back("concat_1.tmp_0");
// }
for (auto& varname : send_vars) {
auto* req_var = request.add_vars();
SerializeToReq(varname, task->scope_, req_var);
}
int num = mpi_rank % xpu_channels_.size();
HeterService_Stub stub(xpu_channels_[num].get());
// stub.service(&cntl, &request, &response,
// brpc::NewCallback(&HeterWrapper::RpcCallBack,
// response, cntl, worker, task));
stub.service(&done->cntl, &request, &done->response, done);
}
void HeterWrapper::CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
HeterCpuWorker* worker, int mpi_rank,
std::vector<std::string>& send_vars) {
HeterRequest request;
HeterResponse response;
brpc::Controller cntl;
request.set_cmd(0);
request.set_cur_batch(task->cur_batch_);
// std::vector<std::string> varnames = {"concat_1.tmp_0", "click", "12345"};
for (auto& varname : send_vars) {
auto* req_var = request.add_vars();
SerializeToReq(varname, task->scope_, req_var);
}
HeterService_Stub stub(xpu_channels_[0].get());
stub.service(&cntl, &request, &response, NULL);
if (cntl.Failed()) {
VLOG(0) << "call xpu fail: " << cntl.ErrorText();
} else {
VLOG(3) << "call xpu success";
for (int i = 0; i < response.vars_size(); ++i) {
DeSerializeToTensor(task->scope_, response.vars(i), platform::CPUPlace());
}
}
}
} // end namespace framework
} // end namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <ctime>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef PADDLE_WITH_PSLIB
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
class HeterCpuWorker;
typedef std::function<void(void*)> HeterRpcCallbackFunc;
class OnHeterRpcDone : public google::protobuf::Closure {
public:
OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
virtual ~OnHeterRpcDone() {}
void Run() {
std::unique_ptr<OnHeterRpcDone> self_guard(this);
handler_(this);
}
HeterRpcCallbackFunc handler_;
HeterResponse response;
brpc::Controller cntl;
};
// Process-wide wrapper around the brpc client channels (xpu_channels_) and
// the brpc server (server_) used for heter RPCs between CPU workers and the
// XPU service. Obtain the shared instance through GetInstance().
class HeterWrapper {
public:
// Stops the embedded brpc server (Stop(1000)) and joins it.
virtual ~HeterWrapper() {
server_.Stop(1000);
server_.Join();
}
HeterWrapper() {}
// Static completion callback; its definition is not in this header.
static void HeterRpcCallBack(HeterResponse* response, brpc::Controller* cntl,
HeterCpuWorker* worker,
std::shared_ptr<HeterTask> task);
void CreateClient2XpuConnection();
// Associates `func` with command id `cmd`; HeterXpuService offers a method of
// the same name (presumably the target of this call - confirm in the .cc).
void RegisterServiceHandler(int cmd, HeterServiceHandler func);
void StartXpuService(const std::string& ip, uint32_t port);
// Sends command 0 for `task` with a completion closure; the closure calls
// worker->Schedule() when the reply arrives.
void CallRemoteXpu(std::shared_ptr<HeterTask> task, HeterCpuWorker* worker,
int mpi_rank, std::vector<std::string>& send_vars);
// Variant of CallRemoteXpu that handles the reply before returning.
void CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
HeterCpuWorker* worker, int mpi_rank,
std::vector<std::string>& send_vars);
// Sends command 2 through channel `num`.
void StopXpuService(int num);
// Sends command 1 through channel `num` and loads the reply into `scope`.
void EndPass(Scope* scope, int num);
void SerializeToReq(const std::string& varname, Scope* scope,
VariableMessage* req_var);
// Maps a VariableMessage element type to the framework's VarType.
framework::proto::VarType::Type ToVarType(VariableMessage::Type type);
// Copies the payload of `req_var` into the tensor named req_var.varname() in
// `scope`, allocated on `place`. The CUDA build takes an extra stream
// argument that defaults to nullptr.
#ifdef PADDLE_WITH_CUDA
void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
platform::Place place,
cudaStream_t stream = nullptr);
#else
void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
platform::Place place);
#endif
// HeterWrapper singleton: created lazily on first use.
// NOTE(review): the null check and reset are not synchronized; concurrent
// first calls could race - confirm the first call happens on one thread.
static std::shared_ptr<HeterWrapper> GetInstance() {
if (NULL == s_instance_) {
s_instance_.reset(new paddle::framework::HeterWrapper());
}
return s_instance_;
}
std::vector<std::string>& GetXpuList() { return xpu_list_; }
void SetXpuList(const std::vector<std::string>& xpu_list);
private:
static std::shared_ptr<HeterWrapper> s_instance_;
protected:
// Client channels, indexed by the `num` argument of StopXpuService/EndPass
// and by mpi_rank % size in CallRemoteXpu.
std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
// Members are destroyed in reverse declaration order, so service_ is
// destroyed before server_; the destructor body has already stopped and
// joined the server by then.
brpc::Server server_;
HeterXpuService service_;
static bool is_initialized_;
DISABLE_COPY_AND_ASSIGN(HeterWrapper);
std::vector<std::string> xpu_list_;
};
} // end namespace framework
} // end namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <unordered_map> // NOLINT
#include <unordered_set> // NOLINT
#include <vector>
#include "paddle/fluid/framework/heter_service.pb.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_PSLIB
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
namespace paddle {
namespace framework {
typedef std::function<int(const HeterRequest*, HeterResponse*)>
HeterServiceHandler;
class DataFeed;
class HeterXpuService : public HeterService {
public:
HeterXpuService() {}
virtual ~HeterXpuService() {}
void service(::google::protobuf::RpcController* controller,
const HeterRequest* request, HeterResponse* response,
::google::protobuf::Closure* done) {
brpc::ClosureGuard done_guard(done);
int ret = 0;
int cmd = request->cmd();
auto itr = handler_map_.find(cmd);
if (itr == handler_map_.end()) {
} else {
ret = itr->second(request, response);
}
// response->set_err_code(0);
// response->set_err_msg("");
if (ret != 0) {
// response->set_err_code(-1);
// response->set_err_msg("xpu service error");
}
}
void RegisterServiceHandler(int cmd, HeterServiceHandler func) {
VLOG(0) << "register heter service";
handler_map_[cmd] = func;
}
private:
std::unordered_map<int, HeterServiceHandler> handler_map_;
};
// Stages of a HeterTask, listed in the order Update() advances through them.
enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE };

// One batch of work travelling through the heter CPU worker: its scope, its
// position in the stage machine, cached sparse features and timing counters.
class HeterTask {
 public:
  // Advances state_ to the next stage. DONE is terminal and stays DONE.
  void Update() {
    switch (state_) {
      case PULL_SPARSE:
        state_ = OP_RUN;
        break;
      case OP_RUN:
        state_ = XPU;
        break;
      case XPU:
        state_ = OP_RUN_END;
        break;
      case OP_RUN_END:
        state_ = PUSH_GRAD;
        break;
      case PUSH_GRAD:
        state_ = DONE;
        break;
      case DONE:
        break;
    }
  }

  // Zeroes every timing counter; ids, state and cached features are kept.
  void Reset() {
    total_time = 0;
    read_time = 0;
    pack_time = 0;
    pull_sparse_local_time = 0;
    op_all_time = 0;
    xpu_op_time = 0;
    xpu_wait_time = 0;
    cpu_op_time = 0;
    collect_label_time = 0;
    fill_sparse_time = 0;
    push_sparse_time = 0;
  }

  // Prints the number of cached feature tables and the size of each one to
  // stdout.
  // NOTE(review): features_[i] indexes the map by 0..size()-1 rather than by
  // table id and inserts missing keys; kept as is - confirm this is intended.
  void Show() {
    std::cout << "features size " << features_.size() << std::endl;
    for (size_t i = 0; i < features_.size(); ++i) {
      std::cout << "features[" << i << "] size " << features_[i].size()
                << std::endl;
    }
  }

  // Binds the task to a batch; defined in the worker source file.
  void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch,
                const ProgramDesc& program);

  Scope* scope_{nullptr};
  // These three members used to have no initializer, so a default-initialized
  // task exposed indeterminate values to Update() and to any reader that ran
  // before PackTask().
  int taskid_{0};
  int cur_batch_{0};
  HeterTaskState state_{PULL_SPARSE};
  // cache, keyed by sparse table id
  std::map<uint64_t, std::vector<uint64_t>> features_;
  std::map<uint64_t, std::vector<float>> feature_labels_;
  std::map<uint64_t, std::vector<std::vector<float>>> feature_values_;
  std::map<uint64_t, std::vector<std::vector<float>>> feature_grads_;
  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
  // timing counters, cleared by Reset()
  double total_time{0};
  double read_time{0};
  double pack_time{0};
  double pull_sparse_local_time{0};
  double op_all_time{0};
  double xpu_op_time{0};
  double xpu_wait_time{0};
  double cpu_op_time{0};
  double collect_label_time{0};
  double fill_sparse_time{0};
  double push_sparse_time{0};
};
// Mutex-protected free list of shared T objects. Get() never blocks: when
// the pool is empty it default-constructs a new object instead.
template <class T>
class HeterObjectPool {
 public:
  HeterObjectPool() {}
  virtual ~HeterObjectPool() {}

  // Returns the most recently pushed object, or a new make_shared<T>() when
  // nothing is pooled. num_ counts the objects created this way.
  std::shared_ptr<T> Get() {
    std::lock_guard<std::mutex> guard(mutex_);
    if (!pool_.empty()) {
      std::shared_ptr<T> recycled = pool_.back();
      pool_.pop_back();
      return recycled;
    }
    ++num_;
#ifdef PADDLE_WITH_CUDA
    VLOG(0) << "pool construct size: " << num_;
#endif
    return std::make_shared<T>();
  }

  // Hands `data` back to the pool.
  void Push(std::shared_ptr<T> data) {
    std::lock_guard<std::mutex> guard(mutex_);
    pool_.push_back(std::move(data));
  }

  // Number of objects currently pooled (not the number ever created).
  int Size() {
    std::lock_guard<std::mutex> guard(mutex_);
    return pool_.size();
  }

  // Direct access to the i-th pooled slot. Takes no lock and performs no
  // range check.
  std::shared_ptr<T>& GetElement(int i) { return pool_[i]; }

 private:
  std::vector<std::shared_ptr<T>> pool_;
  std::mutex mutex_;
  int num_{0};
};
// Scope guard for a bthread mutex: locks on construction, unlocks on
// destruction. The guard stores the pointer only; the mutex must outlive it.
struct BthreadMutextGuard {
  BthreadMutextGuard(bthread_mutex_t* rho) : mutex_(rho) {  // NOLINT
    bthread_mutex_lock(mutex_);
  }
  ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); }

  bthread_mutex_t* mutex_;
};
// Blocking pool of shared T objects built on bthread primitives. It never
// creates objects itself: Get() waits until another caller Push()es one.
template <class T>
class BtObjectPool {
 public:
  BtObjectPool() {
    bthread_mutex_init(&mutex_, NULL);
    bthread_cond_init(&cond_, NULL);
  }
  virtual ~BtObjectPool() {
    bthread_cond_destroy(&cond_);
    bthread_mutex_destroy(&mutex_);
  }

  // Removes and returns the most recently pushed object, waiting on the
  // condition variable while the pool is empty.
  std::shared_ptr<T> Get() {
    BthreadMutextGuard guard(&mutex_);
    // Loop rather than `if`: the pool is re-checked after every wake-up.
    while (pool_.empty()) {
      bthread_cond_wait(&cond_, &mutex_);
    }
    auto ret = pool_.back();
    pool_.pop_back();
    return ret;
  }

  // Adds `data` to the pool and wakes one waiter.
  void Push(std::shared_ptr<T> data) {
    BthreadMutextGuard guard(&mutex_);
    pool_.push_back(std::move(data));
    bthread_cond_signal(&cond_);
  }

  // Number of pooled objects. Takes the mutex, because reading the vector
  // while Get()/Push() modify it is a data race.
  int Size() {
    BthreadMutextGuard guard(&mutex_);
    return pool_.size();
  }

  // Direct access to the i-th pooled slot. Takes no lock and performs no
  // range check; the returned reference is only meaningful while no other
  // caller uses Get()/Push().
  std::shared_ptr<T>& GetElement(int i) { return pool_[i]; }

 private:
  std::vector<std::shared_ptr<T>> pool_;
  bthread_mutex_t mutex_;
  bthread_cond_t cond_;
  int num_{0};
};
// Node of the intrusive doubly-linked list used by HeterList.
template <class K, class T>
struct HeterNode {
  K key;
  T value;
  HeterNode* prev;
  HeterNode* next;
};

// Bounded, mutex-protected container that is both a key -> value index and a
// linked list. New entries are linked right after the head sentinel and
// Get() removes from the same end, so key-less retrieval is last-in,
// first-out.
//
// T must be constructible from nullptr (e.g. a smart or raw pointer): the
// getters return nullptr when nothing is found.
//
// TryGet()/TryPut() form a rendezvous: a TryGet() that misses records the
// key, and the next TryPut() of that key consumes the record and refuses to
// store the value (returns false), telling the producer that a consumer has
// already asked for it.
template <class K, class T>
class HeterList {
 public:
  HeterList() : head_(new HeterNode<K, T>), tail_(new HeterNode<K, T>) {
    head_->prev = NULL;
    head_->next = tail_;
    tail_->prev = head_;
    tail_->next = NULL;
    size = 0;
    cap_ = 1000000000;
  }

  // Frees the sentinels and every node still queued. Only the sentinels used
  // to be deleted, which leaked all remaining entries and their values.
  ~HeterList() {
    HeterNode<K, T>* node = head_;
    while (node != NULL) {
      HeterNode<K, T>* following = node->next;
      delete node;
      node = following;
    }
  }

  // Sets the maximum number of entries; Put()/TryPut() block while the list
  // holds `num` or more.
  void SetCap(int num) { cap_ = num; }

  // Stores (key, value) unless a previous TryGet(key) missed. In that case
  // the pending record is cleared, nothing is stored and false is returned.
  // Blocks while the list is full.
  bool TryPut(K& key, T& value) {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return size < cap_; });
    if (task_map_.find(key) != task_map_.end()) {
      task_map_.erase(key);
      return false;
    }
    insert(key, value);
    return true;
  }

  // Stores (key, value) unconditionally, blocking while the list is full.
  // Always returns true.
  // NOTE(review): storing a key that is already present re-points the index
  // at the new node; the older node stays linked and is only reachable
  // through the key-less Get().
  bool Put(K& key, T& value) {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return size < cap_; });
    insert(key, value);
    return true;
  }

  // Removes and returns the value stored under `key`. On a miss the key is
  // recorded for the TryPut() rendezvous and nullptr is returned.
  T TryGet(const K& key) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto iter = map_.find(key);
    if (iter != map_.end()) {
      return take(iter->second);
    }
    task_map_.insert(key);
    return nullptr;
  }

  // Removes and returns the value stored under `key`, or nullptr when the
  // key is absent. Does not record misses.
  T Get(const K& key) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto iter = map_.find(key);
    if (iter != map_.end()) {
      return take(iter->second);
    }
    return nullptr;
  }

  // Removes and returns the most recently stored value, or nullptr when the
  // list is empty.
  T Get() {
    std::lock_guard<std::mutex> lock(mutex_);
    HeterNode<K, T>* node = head_->next;
    if (node == tail_) {
      return nullptr;
    }
    return take(node);
  }

  bool Empty() {
    std::lock_guard<std::mutex> lock(mutex_);
    return head_->next == tail_;
  }

  int Size() {
    std::lock_guard<std::mutex> lock(mutex_);
    return size;
  }

 private:
  // Allocates a node for (key, value), indexes it and links it after the
  // head sentinel. Caller must hold mutex_.
  void insert(const K& key, const T& value) {
    HeterNode<K, T>* node = new HeterNode<K, T>;
    node->key = key;
    node->value = value;
    map_[node->key] = node;
    attach(node);
  }

  // Unlinks `node`, wakes one blocked producer, drops the index entry for
  // its key, frees the node and returns its value. Caller must hold mutex_.
  T take(HeterNode<K, T>* node) {
    detach(node);
    cond_.notify_one();
    T ret = std::move(node->value);
    map_.erase(node->key);
    delete node;
    return ret;
  }

  void detach(HeterNode<K, T>* node) {
    node->prev->next = node->next;
    node->next->prev = node->prev;
    size--;
  }

  void attach(HeterNode<K, T>* node) {
    node->prev = head_;
    node->next = head_->next;
    head_->next->prev = node;
    head_->next = node;
    size++;
  }

 private:
  HeterNode<K, T>* head_;
  HeterNode<K, T>* tail_;
  std::unordered_map<K, HeterNode<K, T>*> map_;
  std::unordered_set<K> task_map_;
  std::mutex mutex_;
  std::condition_variable cond_;
  int cap_;
  int size;
};
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;
// Generate the generic (abstract) C++ service classes for HeterService.
option cc_generic_services = true;
// Kind of variable being carried: LoDTensor, SelectedRows or NCCL_ID.
enum VarType {
LOD_TENSOR = 0;
SELECTED_ROWS = 1;
NCCL_ID = 2;
}
// VariableMessage is a serialized paddle variable message.
// NOTICE(gongwb): don't modify this proto if you are not
// familiar with how we serialize in sendrecvop_utils.h
// and deserialize it in variable_response.h.
message VariableMessage {
// Element type of the tensor payload.
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
// One LoD level: a flat list of offsets.
message LodData { repeated int64 lod_data = 1; }
optional string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType
optional VarType type = 2;
// bool persistable is not needed for sending.
// tensor info:
optional Type data_type = 3;
repeated int64 dims = 4;
// lod details:
optional int64 lod_level = 5;
repeated LodData lod = 6;
// selected_rows height, aka. original dim0
optional int64 slr_height = 7;
// tensor data
optional bytes data = 8;
}
// Request of the heter service. `cmd` selects the registered handler; the
// HeterWrapper client sends 0 for a batch call, 1 for end-of-pass and 2 to
// stop the service.
message HeterRequest {
required int32 cmd = 1;
optional int32 cur_batch = 2;
repeated VariableMessage vars = 3;
};
// Response of the heter service: the variables returned by the handler.
message HeterResponse {
// optional VariableMessage vars = 1;
repeated VariableMessage vars = 1;
};
service HeterService { rpc service(HeterRequest) returns (HeterResponse); };
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/string/string_helper.h"
#ifdef PADDLE_WITH_PSLIB
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
namespace paddle {
namespace framework {
// Binds this task to batch `cur_batch` and resets its state to PULL_SPARSE.
//
// On first use the task creates its own child scope of `thread_scope` and
// initializes every non-persistable variable of block 0 of `program` in it.
// Then, for every slot alias reported by `reader`, the tensor is copied from
// `thread_scope` into the task scope (CPU place) together with level 0 of
// its LoD.
// NOTE(review): assumes every slot variable exists in both scopes and has at
// least one LoD level; neither is checked here.
void HeterTask::PackTask(Scope* thread_scope, int taskid, DataFeed* reader,
                         int cur_batch, const ProgramDesc& program) {
  taskid_ = taskid;
  auto& block = program.Block(0);
  if (scope_ == nullptr) {
    scope_ = &(thread_scope->NewScope());
    for (auto& var_desc : block.AllVars()) {
      if (var_desc->Persistable()) {
        continue;
      }
      auto* new_var = scope_->Var(var_desc->Name());
      InitializeVariable(new_var, var_desc->GetType());
    }
  }
  state_ = PULL_SPARSE;
  cur_batch_ = cur_batch;

  auto& slot_aliases = reader->GetUseSlotAlias();
  for (size_t slot = 0; slot < slot_aliases.size(); ++slot) {
    LoDTensor* src_tensor =
        thread_scope->FindVar(slot_aliases[slot])->GetMutable<LoDTensor>();
    LoDTensor* dst_tensor =
        scope_->FindVar(slot_aliases[slot])->GetMutable<LoDTensor>();
    TensorCopy(*src_tensor, platform::CPUPlace(), dst_tensor);
    // Only the first LoD level is carried over.
    auto& first_level = src_tensor->lod()[0];
    LoD copied_lod{first_level};
    dst_tensor->set_lod(copied_lod);
  }
}
// Caches the XPU op range (xpu_start_idx / xpu_end_idx of the trainer
// descriptor) in xpu_begin_op_index_ / xpu_end_op_index_ and logs it.
// No consistency checks against the op list are performed.
void HeterCpuWorker::GetXpuOpIndex() {
  const auto begin_idx = trainer_desc_.xpu_start_idx();
  const auto end_idx = trainer_desc_.xpu_end_idx();
  xpu_begin_op_index_ = begin_idx;
  xpu_end_op_index_ = end_idx;
  VLOG(0) << "xpu begin: " << xpu_begin_op_index_
          << " xpu end: " << xpu_end_op_index_;
}
// Moves task `taskid` from the wait queue to the run queue if it is parked
// there. When it is not, nothing is moved here; wait_queue_.TryGet() is
// presumably HeterList::TryGet, which records the miss - confirm against the
// queue type.
void HeterCpuWorker::Schedule(int taskid) {
  VLOG(3) << "schedule " << taskid;
  auto parked_task = wait_queue_.TryGet(taskid);
  if (!parked_task) {
    return;
  }
  run_queue_.Put(parked_task->taskid_, parked_task);
}
// Tries to park `task` in the wait queue; when the wait queue refuses it
// (TryPut returns false) the task goes straight to the run queue.
void HeterCpuWorker::JumpContext(std::shared_ptr<HeterTask> task) {
  VLOG(3) << "jump context " << task->taskid_;
  const bool parked = wait_queue_.TryPut(task->taskid_, task);
  if (parked) {
    return;
  }
  run_queue_.Put(task->taskid_, task);
}
void HeterCpuWorker::Initialize(const TrainerDesc& desc) {
param_ = desc.downpour_param();
mpi_rank_ = desc.mpi_rank();
trainer_desc_ = desc;
for (int i = 0; i < param_.sparse_table_size(); ++i) {
uint64_t table_id =
static_cast<uint64_t>(param_.sparse_table(i).table_id());
TableParameter table = param_.sparse_table(i);
sparse_key_names_[table_id].resize(table.sparse_key_name_size());
for (int j = 0; j < table.sparse_key_name_size(); ++j) {
sparse_key_names_[table_id][j] = table.sparse_key_name(j);
}
sparse_value_names_[table_id].resize(table.sparse_value_name_size());
for (int j = 0; j < table.sparse_value_name_size(); ++j) {
sparse_value_names_[table_id][j] = table.sparse_value_name(j);
}
sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
}
label_var_name_[table_id] = table.label_var_name();
sparse_push_keys_[table_id] = std::vector<uint64_t>();
}
for (int i = 0; i < param_.dense_table_size(); ++i) {
uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
auto table = param_.dense_table(i);
dense_value_names_[table_id].resize(table.dense_value_name_size());
for (int j = 0; j < table.dense_value_name_size(); ++j) {
dense_value_names_[table_id][j] = table.dense_value_name(j);
}
dense_grad_names_[table_id].resize(table.dense_grad_name_size());
for (int j = 0; j < table.dense_grad_name_size(); ++j) {
dense_grad_names_[table_id][j] = table.dense_grad_name(j);
}
}
skip_ops_.resize(param_.skip_ops_size());
for (int i = 0; i < param_.skip_ops_size(); ++i) {
skip_ops_[i] = param_.skip_ops(i);
}
for (int i = 0; i < param_.stat_var_names_size(); ++i) {
stat_var_name_map_[param_.stat_var_names(i)] = 1;
}
need_to_push_sparse_ = param_.push_sparse();
need_to_push_dense_ = param_.push_dense();
fleet_ptr_ = FleetWrapper::GetInstance();
heter_ptr_ = HeterWrapper::GetInstance();
fetch_config_ = desc.fetch_config();
use_cvm_ = desc.use_cvm();
// for sparse value accessor, embedding only
no_cvm_ = desc.no_cvm();
scale_datanorm_ = desc.scale_datanorm();
dump_slot_ = desc.dump_slot();
dump_fields_.resize(desc.dump_fields_size());
for (int i = 0; i < desc.dump_fields_size(); ++i) {
dump_fields_[i] = desc.dump_fields(i);
}
adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
need_dump_param_ = false;
dump_param_.resize(desc.dump_param_size());
for (int i = 0; i < desc.dump_param_size(); ++i) {
dump_param_[i] = desc.dump_param(i);
}
if (desc.dump_param_size() != 0) {
need_dump_param_ = true;
}
for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
check_nan_var_names_.push_back(desc.check_nan_var_names(i));
}
copy_table_config_ = desc.copy_table_config();
for (int i = 0; i < copy_table_config_.src_sparse_tables_size(); ++i) {
uint64_t src_table = copy_table_config_.src_sparse_tables(i);
uint64_t dest_table = copy_table_config_.dest_sparse_tables(i);
VLOG(3) << "copy_sparse_tables_ push back " << src_table << "->"
<< dest_table;
copy_sparse_tables_.push_back(std::make_pair(src_table, dest_table));
}
for (int i = 0; i < copy_table_config_.src_dense_tables_size(); ++i) {
uint64_t src_table = copy_table_config_.src_dense_tables(i);
uint64_t dest_table = copy_table_config_.dest_dense_tables(i);
VLOG(3) << "copy_dense_tables_ push back " << src_table << "->"
<< dest_table;
copy_dense_tables_.push_back(std::make_pair(src_table, dest_table));
}
for (auto& m : copy_table_config_.table_denpendency_map()) {
if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) {
// currently only support one dependency
for (auto& value : m.values()) {
table_dependency_[m.key()] = value;
}
}
}
}
// Rebinds writer_ to `queue` by calling writer_.Reset(queue).
void HeterCpuWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
writer_.Reset(queue);
}
// Stores the flag in need_dump_field_.
void HeterCpuWorker::SetNeedDump(bool need_dump_field) {
need_dump_field_ = need_dump_field;
}
// template <typename T>
// std::string PrintLodTensorType(LoDTensor* tensor,
// int64_t start, int64_t end) {
// auto count = tensor->numel();
// if (start < 0 || end > count) {
// VLOG(3) << "access violation";
// return "access violation";
// }
// std::ostringstream os;
// for (int64_t i = start; i < end; i++) {
// os << ":" << tensor->data<T>()[i];
// }
// return os.str();
// }
//
// std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start,
// int64_t end) {
// auto count = tensor->numel();
// if (start < 0 || end > count) {
// VLOG(3) << "access violation";
// return "access violation";
// }
// std::ostringstream os;
// for (int64_t i = start; i < end; i++) {
// os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
// }
// return os.str();
// }
//
// std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) {
// std::string out_val;
// if (tensor->type() == proto::VarType::FP32) {
// out_val = PrintLodTensorType<float>(tensor, start, end);
// } else if (tensor->type() == proto::VarType::INT64) {
// out_val = PrintLodTensorIntType(tensor, start, end);
// } else if (tensor->type() == proto::VarType::FP64) {
// out_val = PrintLodTensorType<double>(tensor, start, end);
// } else {
// out_val = "unsupported type";
// }
// return out_val;
// }
//
// std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) {
// auto& dims = tensor->dims();
// if (tensor->lod().size() != 0) {
// auto& lod = tensor->lod()[0];
// return {lod[index] * dims[1], lod[index + 1] * dims[1]};
// } else {
// return {index * dims[1], (index + 1) * dims[1]};
// }
// }
//
// bool CheckValidOutput(LoDTensor* tensor, size_t batch_size) {
// auto& dims = tensor->dims();
// if (dims.size() != 2) return false;
// if (tensor->lod().size() != 0) {
// auto& lod = tensor->lod()[0];
// if (lod.size() != batch_size + 1) {
// return false;
// }
// } else {
// if (dims[0] != static_cast<int>(batch_size)) {
// return false;
// }
// }
// return true;
// }
// Currently a no-op: the whole implementation below is commented out, so
// calling this function has no effect.
void HeterCpuWorker::DumpParam() {
// std::string os;
// for (auto& param : dump_param_) {
// os.clear();
// os = param;
// Variable* var = thread_scope_->FindVar(param);
// if (var == nullptr) {
// continue;
// }
// LoDTensor* tensor = var->GetMutable<LoDTensor>();
// int64_t len = tensor->numel();
// os += PrintLodTensor(tensor, 0, len);
// writer_ << os;
// }
}
void HeterCpuWorker::CollectLabelInfo(std::shared_ptr<HeterTask> task,
size_t table_idx) {
if (no_cvm_) {
return;
}
uint64_t table_id = static_cast<uint64_t>(
param_.program_config(0).pull_sparse_table_id(table_idx));
TableParameter table;
for (auto i : param_.sparse_table()) {
if (i.table_id() == table_id) {
table = i;
break;
}
}
auto& feature = (task->features_)[table_id];
auto& feature_label = (task->feature_labels_)[table_id];
Scope* scope = task->scope_;
feature_label.resize(feature.size());
Variable* var = scope->FindVar(label_var_name_[table_id]);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
int64_t* label_ptr = tensor->data<int64_t>();
size_t global_index = 0;
for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
VLOG(3) << "sparse_key_names_[" << i
<< "]: " << sparse_key_names_[table_id][i];
Variable* fea_var = scope->FindVar(sparse_key_names_[table_id][i]);
if (fea_var == nullptr) {
continue;
}
LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
CHECK(tensor != nullptr) << "tensor of var "
<< sparse_key_names_[table_id][i] << " is null";
// skip slots which do not have embedding
Variable* emb_var = scope->FindVar(sparse_value_names_[table_id][i]);
if (emb_var == nullptr) {
continue;
}
int64_t* ids = tensor->data<int64_t>();
size_t fea_idx = 0;
// tensor->lod()[0].size() == batch_size + 1
for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
// should be skipped feasign defined in protobuf
if (ids[fea_idx] == 0u) {
continue;
}
feature_label[global_index++] =
static_cast<float>(label_ptr[lod_idx - 1]);
}
}
}
CHECK(global_index == feature.size())
<< "expect fea info size:" << feature.size() << " real:" << global_index;
}
void HeterCpuWorker::FillSparseValue(std::shared_ptr<HeterTask> task,
size_t table_idx) {
uint64_t table_id = static_cast<uint64_t>(
param_.program_config(0).pull_sparse_table_id(table_idx));
TableParameter table;
for (auto i : param_.sparse_table()) {
if (i.table_id() == table_id) {
table = i;
break;
}
}
auto& fea_value = (task->feature_values_)[table_id];
Scope* scope = task->scope_;
auto fea_idx = 0u;
std::vector<float> init_value(table.fea_dim());
for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
std::string slot_name = sparse_key_names_[table_id][i];
std::string emb_slot_name = sparse_value_names_[table_id][i];
Variable* var = scope->FindVar(slot_name);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
CHECK(tensor != nullptr) << "tensor of var " << slot_name << " is null";
int64_t* ids = tensor->data<int64_t>();
int len = tensor->numel();
Variable* var_emb = scope->FindVar(emb_slot_name);
if (var_emb == nullptr) {
continue;
}
LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
float* ptr =
tensor_emb->mutable_data<float>({len, table.emb_dim()}, place_);
// memset(ptr, 0, sizeof(float) * len * table.emb_dim());
auto& tensor_lod = tensor->lod()[0];
LoD data_lod{tensor_lod};
tensor_emb->set_lod(data_lod);
bool is_nid = (adjust_ins_weight_config_.need_adjust() &&
adjust_ins_weight_config_.nid_slot() == emb_slot_name);
if (is_nid) {
nid_show_.clear();
}
int nid_ins_index = 0;
for (int index = 0; index < len; ++index) {
if (use_cvm_ || no_cvm_) {
if (ids[index] == 0u) {
memcpy(ptr + table.emb_dim() * index, init_value.data(),
sizeof(float) * table.emb_dim());
if (is_nid) {
nid_show_.push_back(-1);
++nid_ins_index;
}
continue;
}
memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(),
sizeof(float) * table.emb_dim());
if (is_nid &&
static_cast<size_t>(index) == tensor->lod()[0][nid_ins_index]) {
nid_show_.push_back(fea_value[fea_idx][0]);
++nid_ins_index;
}
fea_idx++;
} else {
if (ids[index] == 0u) {
memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
sizeof(float) * table.emb_dim());
if (is_nid) {
nid_show_.push_back(-1);
++nid_ins_index;
}
continue;
}
memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2,
sizeof(float) * table.emb_dim());
if (is_nid &&
static_cast<size_t>(index) == tensor->lod()[0][nid_ins_index]) {
nid_show_.push_back(fea_value[fea_idx][0]);
++nid_ins_index;
}
fea_idx++;
}
}
}
}
// Raises instance weights according to the nid "show" values collected in
// nid_show_ (compiled only when _LINUX is defined; a no-op otherwise).
//
// Returns early, with a log line, when adjustment is disabled or when the
// nid slot / instance-weight variable cannot be found in the task scope. For
// every entry whose show value lies in [0, nid_adjw_threshold) a candidate
// weight log(e + (threshold - show) / threshold * ratio) is computed and
// stored if it exceeds the current weight.
// NOTE(review): the weight slot index advances only for adjusted entries,
// not once per instance - confirm this is the intended mapping.
void HeterCpuWorker::AdjustInsWeight(std::shared_ptr<HeterTask> task) {
#ifdef _LINUX
  // check var and tensor not null
  Scope* scope = task->scope_;
  if (!adjust_ins_weight_config_.need_adjust()) {
    VLOG(0) << "need_adjust=false, skip adjust ins weight";
    return;
  }

  Variable* nid_var = scope->FindVar(adjust_ins_weight_config_.nid_slot());
  if (nid_var == nullptr) {
    VLOG(0) << "nid slot var " << adjust_ins_weight_config_.nid_slot()
            << " is nullptr, skip adjust ins weight";
    return;
  }
  LoDTensor* nid_tensor = nid_var->GetMutable<LoDTensor>();
  if (nid_tensor == nullptr) {
    VLOG(0) << "tensor of nid slot var " << adjust_ins_weight_config_.nid_slot()
            << " is nullptr, skip adjust ins weight";
    return;
  }

  Variable* weight_var =
      scope->FindVar(adjust_ins_weight_config_.ins_weight_slot());
  if (weight_var == nullptr) {
    VLOG(0) << "ins weight var " << adjust_ins_weight_config_.ins_weight_slot()
            << " is nullptr, skip adjust ins weight";
    return;
  }
  LoDTensor* weight_tensor = weight_var->GetMutable<LoDTensor>();
  if (weight_tensor == nullptr) {
    VLOG(0) << "tensor of ins weight tensor "
            << adjust_ins_weight_config_.ins_weight_slot()
            << " is nullptr, skip adjust ins weight";
    return;
  }

  float* ins_weights = weight_tensor->data<float>();
  size_t len = weight_tensor->numel();  // len = batch size
  // here we assume nid_show slot only has one feasign in each instance
  CHECK(len == nid_show_.size()) << "ins_weight size should be equal to "
                                 << "nid_show size, " << len << " vs "
                                 << nid_show_.size();

  float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold();
  float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio();
  int64_t adjusted_count = 0;
  double adjusted_weight_sum = 0.0;
  size_t ins_index = 0;

  for (size_t i = 0; i < len; ++i) {
    float nid_show = nid_show_[i];
    VLOG(3) << "nid_show " << nid_show;
    if (nid_show < 0) {
      VLOG(3) << "nid_show < 0, continue";
      continue;
    }
    if (!(nid_show >= 0 && nid_show < nid_adjw_threshold)) {
      continue;
    }
    float ins_weight = log(M_E +
                           (nid_adjw_threshold - nid_show) /
                               nid_adjw_threshold * nid_adjw_ratio);
    // count nid adjw insnum and weight
    ++adjusted_count;
    adjusted_weight_sum += ins_weight;
    // choose large ins weight
    VLOG(3) << "ins weight new " << ins_weight << ", ins weight origin "
            << ins_weights[ins_index];
    if (ins_weight > ins_weights[ins_index]) {
      VLOG(3) << "ins " << ins_index << " weight changes to " << ins_weight;
      ins_weights[ins_index] = ins_weight;
    }
    ++ins_index;
  }
  VLOG(3) << "nid adjw info: total_adjw_num: " << adjusted_count
          << ", avg_adjw_weight: " << adjusted_weight_sum;
#endif
}
// Copies every configured (source, destination) sparse table pair through the
// fleet wrapper, then releases the feasigns recorded for the source table.
// Pairs whose source and destination are the same table are skipped.
void HeterCpuWorker::CopySparseTable() {
  for (const auto& table_pair : copy_sparse_tables_) {
    const int64_t src_table = table_pair.first;
    const int64_t dest_table = table_pair.second;
    if (src_table == dest_table) {
      continue;
    }
    int32_t feanum = 0;
    if (copy_table_config_.sparse_copy_by_feasign()) {
      // Copy only the feasigns recorded for the source table.
      auto& recorded = feasign_set_[src_table];
      std::vector<uint64_t> fea_vec(recorded.begin(), recorded.end());
      feanum = fleet_ptr_->CopyTableByFeasign(src_table, dest_table, fea_vec);
    } else {
      // Whole-table copy; skipped when nothing was recorded for the source.
      auto found = feasign_set_.find(src_table);
      if (found == feasign_set_.end() || found->second.size() == 0) {
        continue;
      }
      feanum = fleet_ptr_->CopyTable(src_table, dest_table);
    }
    VLOG(3) << "copy feasign from table " << src_table << " to table "
            << dest_table << ", feasign num=" << feanum;
    // Swap with an empty set so the storage is released, not merely cleared.
    std::unordered_set<uint64_t>().swap(feasign_set_[src_table]);
  }
  feasign_set_.clear();
}
// Copies every configured (source, destination) dense table pair through the
// fleet wrapper. Only the worker whose thread id is 0 performs the copy.
void HeterCpuWorker::CopyDenseTable() {
  if (thread_id_ != 0) {
    return;
  }
  thread_local std::vector<std::future<int32_t>> pull_dense_status;
  for (const auto& table_pair : copy_dense_tables_) {
    const uint64_t src_table = table_pair.first;
    const uint64_t dest_table = table_pair.second;
    if (src_table == dest_table) {
      continue;
    }
    const int32_t dim = fleet_ptr_->CopyTable(src_table, dest_table);
    VLOG(3) << "copy param from table " << src_table << " to table "
            << dest_table << ", dim=" << dim;
    if (!copy_table_config_.dense_pull_after_copy()) {
      continue;
    }
    VLOG(3) << "dense pull after copy, table=" << dest_table;
    pull_dense_status.resize(0);
    // The asynchronous dense pull that would fill pull_dense_status is
    // disabled, so the wait loop below currently has nothing to wait on:
    //   fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
    //                                  dense_value_names_[dest_table],
    //                                  &pull_dense_status);
    for (auto& pending : pull_dense_status) {
      pending.wait();
      if (pending.get() != 0) {
        LOG(WARNING) << "pull dense after copy table failed,"
                     << " table=" << dest_table;
      }
    }
  }
}
// Copies the float contents of each configured source variable into the
// matching destination variable of thread_scope_. Only the worker whose
// thread id is 0 performs the copy.
//
// Aborts (via CHECK) when a variable or its tensor is missing, when the two
// tensors hold a different number of elements, or when src_var_list has an
// entry with no counterpart in dest_var_list.
void HeterCpuWorker::CopyDenseVars() {
  if (thread_id_ != 0) {
    return;
  }
  for (int i = 0; i < copy_table_config_.src_var_list_size(); ++i) {
    // dest_var_list(i) must exist before it is read; indexing past the end of
    // a repeated field is not a defined operation.
    CHECK(i < copy_table_config_.dest_var_list_size())
        << "src_var_list has more entries than dest_var_list, "
        << copy_table_config_.src_var_list_size() << " vs "
        << copy_table_config_.dest_var_list_size();
    auto& src_var_name = copy_table_config_.src_var_list(i);
    auto& dest_var_name = copy_table_config_.dest_var_list(i);
    if (src_var_name == dest_var_name) {
      continue;
    }
    VLOG(3) << "copy dense var from " << src_var_name << " to "
            << dest_var_name;
    Variable* src_var = thread_scope_->FindVar(src_var_name);
    CHECK(src_var != nullptr) << src_var_name << " not found";  // NOLINT
    LoDTensor* src_tensor = src_var->GetMutable<LoDTensor>();
    CHECK(src_tensor != nullptr) << src_var_name
                                 << " tensor is null";  // NOLINT
    float* src_data = src_tensor->data<float>();
    Variable* dest_var = thread_scope_->FindVar(dest_var_name);
    CHECK(dest_var != nullptr) << dest_var_name << " not found";  // NOLINT
    LoDTensor* dest_tensor = dest_var->GetMutable<LoDTensor>();
    CHECK(dest_tensor != nullptr) << dest_var_name
                                  << " tensor is null";  // NOLINT
    float* dest_data = dest_tensor->data<float>();
    CHECK(src_tensor->numel() == dest_tensor->numel())
        << "tensor numel not equal," << src_tensor->numel() << " vs "
        << dest_tensor->numel();
    // numel() is 64-bit: use a 64-bit index (an int would overflow on very
    // large tensors) and a distinct name so the outer index is not shadowed.
    const int64_t numel = src_tensor->numel();
    for (int64_t k = 0; k < numel; ++k) {
      dest_data[k] = src_data[k];
    }
  }
}
// Profiling variant of the training loop.
//
// Each HeterTask is driven through the states PULL_SPARSE, OP_RUN, XPU,
// OP_RUN_END, PUSH_GRAD and DONE. The wall time of every stage is accumulated
// on the task itself and folded into the function-local totals when the task
// reaches DONE. On thread 0, averaged timings are printed to stderr every 100
// finished tasks. After the loop, tables/vars are copied when
// copy_table_config_.need_copy() is set.
void HeterCpuWorker::TrainFilesWithProfiler() {
VLOG(3) << "Begin to train files with profiler";
platform::SetNumThreads(1);
device_reader_->Start();
// op_name / op_total_time are filled in here but are not read again in this
// function (the per-op report that used them is disabled).
std::vector<double> op_total_time;
std::vector<std::string> op_name;
for (auto& op : ops_) {
bool need_skip = false;
for (auto t = 0u; t < skip_ops_.size(); ++t) {
if (op->Type().find(skip_ops_[t]) != std::string::npos) {
need_skip = true;
break;
}
}
if (!need_skip) {
op_name.push_back(op->Type());
}
}
VLOG(3) << "op name size: " << op_name.size();
op_total_time.resize(op_name.size());
for (size_t i = 0; i < op_total_time.size(); ++i) {
op_total_time[i] = 0.0;
}
platform::Timer timeline;
// Totals over all finished tasks; updated only in the DONE branch.
double total_time = 0.0;
double read_time = 0.0;
double pack_time = 0.0;
double pull_sparse_local_time = 0.0;
// NOTE(review): task->op_all_time is never written in this function, so
// op_all_time only sums whatever value the task already carries.
double op_all_time = 0;
double xpu_op_time = 0;
double xpu_wait_time = 0;
double cpu_op_time = 0;
double collect_label_time = 0;
double fill_sparse_time = 0;
double push_sparse_time = 0;
// batch_cnt counts packed tasks, done_cnt counts tasks that reached DONE.
int batch_cnt = 0;
int done_cnt = 0;
int cur_batch;
uint64_t total_inst = 0;
wait_queue_.SetCap(1);
while (1) {
std::shared_ptr<HeterTask> task;
task = run_queue_.Get();
// No task came out of run_queue_: read a new batch and pack a new task.
if (!task) {
double tmp_read_time;
timeline.Start();
cur_batch = device_reader_->Next();
timeline.Pause();
tmp_read_time = timeline.ElapsedSec();
if (cur_batch <= 0) {
// Reader returned no batch: stop once every packed task is done,
// otherwise go back and poll run_queue_ again.
if (batch_cnt == done_cnt) {
break;
} else {
continue;
}
}
batch_cnt += 1;
int taskid = batch_cnt * worker_num_ + thread_id_;
timeline.Start();
task = object_pool_.Get();
task->Reset();
task->PackTask(thread_scope_, taskid, device_reader_, cur_batch,
program_);
timeline.Pause();
task->read_time = tmp_read_time;
task->pack_time = timeline.ElapsedSec();
task->total_time = tmp_read_time + task->pack_time;
}
// Run the task's state machine until it leaves this thread (XPU) or is DONE.
// NOTE(review): task->Update() presumably advances task->state_ to the next
// stage — confirm against HeterTask.
for (;;) {
// pull sparse here
if (task->state_ == PULL_SPARSE) {
timeline.Start();
for (int i = 0;
i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).pull_sparse_table_id(i));
// Look up the table parameter whose id matches tid (elements are copied
// by value while searching).
TableParameter table;
for (auto j : param_.sparse_table()) {
if (j.table_id() == tid) {
table = j;
break;
}
}
fleet_ptr_->HeterPullSparseVars(
thread_id_, task, tid, sparse_key_names_[tid], table.fea_dim(),
sparse_value_names_[tid]);
}
task->Update();
// JumpContext(task);
timeline.Pause();
task->pull_sparse_local_time += timeline.ElapsedSec();
task->total_time += timeline.ElapsedSec();
} else if (task->state_ == OP_RUN) {
// total_time += task->total_time;
// read_time += task->read_time;
// pack_time += task->pack_time;
// pull_sparse_local_time += task->pull_sparse_local_time;
for (int i = 0;
i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).pull_sparse_table_id(i));
timeline.Start();
CollectLabelInfo(task, i);
timeline.Pause();
task->collect_label_time += timeline.ElapsedSec();
task->total_time += timeline.ElapsedSec();
timeline.Start();
FillSparseValue(task, i);
timeline.Pause();
task->fill_sparse_time += timeline.ElapsedSec();
task->total_time += timeline.ElapsedSec();
// Adjust instance weights only when this table's value names contain the
// configured nid slot. This call is not included in any timer.
auto nid_iter = std::find(sparse_value_names_[tid].begin(),
sparse_value_names_[tid].end(),
adjust_ins_weight_config_.nid_slot());
if (nid_iter != sparse_value_names_[tid].end()) {
AdjustInsWeight(task);
}
}
VLOG(3) << "fill sparse value for all sparse table done.";
// do computation here
// int run_op_idx = 0;
// Run the ops located before xpu_begin_op_index_ on this worker; ops whose
// type contains any entry of skip_ops_ are skipped.
timeline.Start();
for (int i = 0; i < xpu_begin_op_index_; ++i) {
auto& op = ops_[i];
bool need_skip = false;
for (auto t = 0u; t < skip_ops_.size(); ++t) {
if (op->Type().find(skip_ops_[t]) != std::string::npos) {
need_skip = true;
break;
}
}
if (!need_skip) {
// timeline.Start();
op->Run(*(task->scope_), place_);
// timeline.Pause();
// op_total_time[run_op_idx++] += timeline.ElapsedSec();
// total_time += timeline.ElapsedSec();
}
}
task->Update();
timeline.Pause();
task->cpu_op_time += timeline.ElapsedSec();
task->total_time += timeline.ElapsedSec();
} else if (task->state_ == XPU) {
timeline.Start();
VLOG(3) << "call remote xpu taskid = " << task->taskid_;
std::vector<std::string> send_var_list;
for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) {
send_var_list.push_back(trainer_desc_.xpu_recv_list(i));
}
heter_ptr_->CallRemoteXpu(task, this, mpi_rank_, send_var_list);
timeline.Pause();
task->xpu_op_time += timeline.ElapsedSec();
task->total_time += timeline.ElapsedSec();
task->Update();
timeline.Start();
JumpContext(task);
timeline.Pause();
task->xpu_wait_time += timeline.ElapsedSec();
task->total_time += timeline.ElapsedSec();
// Leave the state machine for this task.
// NOTE(review): presumably the task comes back through run_queue_ once the
// remote call has completed — confirm against JumpContext/CallRemoteXpu.
break;
} else if (task->state_ == OP_RUN_END) {
// Run the ops located after xpu_end_op_index_, with the same skip rule.
timeline.Start();
for (size_t i = xpu_end_op_index_ + 1; i < ops_.size(); ++i) {
auto& op = ops_[i];
bool need_skip = false;
for (auto t = 0u; t < skip_ops_.size(); ++t) {
if (op->Type().find(skip_ops_[t]) != std::string::npos) {
need_skip = true;
break;
}
}
if (!need_skip) {
op->Run(*(task->scope_), place_);
}
}
// check inf and nan
// NOTE(review): this loop only looks the variables up; no inf/nan check is
// actually performed on the tensors.
for (std::string& var_name : check_nan_var_names_) {
Variable* var = (task->scope_)->FindVar(var_name);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (tensor == nullptr) {
continue;
}
}
task->Update();
timeline.Pause();
task->cpu_op_time += timeline.ElapsedSec();
task->total_time += timeline.ElapsedSec();
} else if (task->state_ == PUSH_GRAD) {
if (need_to_push_sparse_) {
// push gradients here
for (int i = 0;
i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).push_sparse_table_id(i));
TableParameter table;
// NOTE(review): the range variable `i` below shadows the outer loop index
// and copies each TableParameter by value.
for (auto i : param_.sparse_table()) {
if (i.table_id() == tid) {
table = i;
break;
}
}
timeline.Start();
fleet_ptr_->HeterPushSparseVars(
task, tid, sparse_key_names_[tid], sparse_grad_names_[tid],
table.emb_dim(), &push_sparse_status_, use_cvm_, dump_slot_,
no_cvm_);
timeline.Pause();
task->push_sparse_time += timeline.ElapsedSec();
task->total_time += timeline.ElapsedSec();
}
}
if (need_to_push_sparse_) {
VLOG(3) << "push sparse gradient done.";
// tmp_push_sparse_wait_times is fixed at -1, so push_sparse_wait_times is
// the largest uint32_t value and the pending statuses are dropped by the
// second `if` below on every pass rather than waited on.
int32_t tmp_push_sparse_wait_times = -1;
static uint32_t push_sparse_wait_times =
static_cast<uint32_t>(tmp_push_sparse_wait_times);
if (push_sparse_status_.size() >= push_sparse_wait_times) {
for (auto& t : push_sparse_status_) {
t.wait();
}
push_sparse_status_.resize(0);
}
if (tmp_push_sparse_wait_times == -1) {
push_sparse_status_.resize(0);
}
}
// thread_scope_->DropKids();
task->Update();
} else if (task->state_ == DONE) {
PrintFetchVars();
++done_cnt;
total_inst += task->cur_batch_;
// The task object is handed back to the pool; its timing fields are still
// read below through the local shared_ptr.
object_pool_.Push(task);
total_time += task->total_time;
read_time += task->read_time;
pack_time += task->pack_time;
pull_sparse_local_time += task->pull_sparse_local_time;
op_all_time += task->op_all_time;
xpu_op_time += task->xpu_op_time;
xpu_wait_time += task->xpu_wait_time;
cpu_op_time += task->cpu_op_time;
collect_label_time += task->collect_label_time;
fill_sparse_time += task->fill_sparse_time;
push_sparse_time += task->push_sparse_time;
// ++batch_cnt;
if (thread_id_ == 0) {
// should be configured here
// Report every 100 finished tasks: per-task averages first, then each
// stage as a percentage of the accumulated total_time.
if (done_cnt > 0 && done_cnt % 100 == 0) {
// NOTE: the per-op timing report (based on op_name / op_total_time) is
// disabled; only the per-stage figures below are printed.
fprintf(stderr, "cpu op run total time: %fs\n",
cpu_op_time / done_cnt);
fprintf(stderr, "xpu op run total time: %fs\n",
xpu_op_time / done_cnt);
fprintf(stderr, "xpu wait total time: %fs\n",
xpu_wait_time / done_cnt);
fprintf(stderr, "pack task time: %fs\n", pack_time / done_cnt);
fprintf(stderr, "train total time: %fs\n", total_time / done_cnt);
fprintf(stderr, "pull sparse local time: %fs\n",
pull_sparse_local_time / done_cnt);
fprintf(stderr, "fill sparse time: %fs\n",
fill_sparse_time / done_cnt);
fprintf(stderr, "push sparse time: %fs\n",
push_sparse_time / done_cnt);
fprintf(stderr, "collect label time: %fs\n",
collect_label_time / done_cnt);
fprintf(stderr, "mean read time: %fs\n", read_time / done_cnt);
fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100);
fprintf(stderr, "cpu op run percent: %f\n",
cpu_op_time / total_time * 100);
fprintf(stderr, "xpu op run percent: %f\n",
xpu_op_time / total_time * 100);
fprintf(stderr, "xpu wait percent: %f\n",
xpu_wait_time / total_time * 100);
fprintf(stderr, "pack task percent: %f\n",
pack_time / total_time * 100);
fprintf(stderr, "pull sparse local time percent: %f\n",
pull_sparse_local_time / total_time * 100);
fprintf(stderr, "collect label time percent: %f\n",
collect_label_time / total_time * 100);
fprintf(stderr, "fill sparse time percent: %f\n",
fill_sparse_time / total_time * 100);
fprintf(stderr, "push sparse time percent: %f\n",
push_sparse_time / total_time * 100);
fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time);
}
}
break;
}
}
}
// Training loop finished: copy tables and dense vars once if configured.
if (copy_table_config_.need_copy()) {
CopySparseTable();
CopyDenseTable();
CopyDenseVars();
}
}
// Main training loop of the heterogeneous CPU worker.
//
// Each HeterTask is driven through the states PULL_SPARSE, OP_RUN, XPU,
// OP_RUN_END, PUSH_GRAD and DONE. Tasks are taken from run_queue_ when one is
// available; otherwise a new batch is read and packed into a task from
// object_pool_. The loop ends when the reader yields no batch and every
// packed task has reached DONE. After the loop, tables/vars are copied when
// copy_table_config_.need_copy() is set.
void HeterCpuWorker::TrainFiles() {
VLOG(3) << "Begin to train files";
platform::SetNumThreads(1);
device_reader_->Start();
// batch_cnt counts packed tasks, done_cnt counts tasks that reached DONE.
int batch_cnt = 0;
int done_cnt = 0;
int cur_batch;
wait_queue_.SetCap(1);
need_to_push_dense_ = false;
while (1) {
// NOTE: periodic table copying during training (recording pushed feasigns
// and copying every copy_table_config_.batch_num() batches) is disabled
// here; the copy only happens once, after the loop.
std::shared_ptr<HeterTask> task;
task = run_queue_.Get();
// No task came out of run_queue_: read a new batch and pack a new task.
if (!task) {
cur_batch = device_reader_->Next();
if (cur_batch <= 0) {
// Reader returned no batch: stop once every packed task is done,
// otherwise go back and poll run_queue_ again.
if (batch_cnt == done_cnt) {
break;
} else {
continue;
}
}
batch_cnt += 1;
int taskid = batch_cnt * worker_num_ + thread_id_;
task = object_pool_.Get();
task->Reset();
task->PackTask(thread_scope_, taskid, device_reader_, cur_batch,
program_);
}
// Run the task's state machine until it leaves this thread (XPU) or is DONE.
// NOTE(review): task->Update() presumably advances task->state_ to the next
// stage — confirm against HeterTask.
for (;;) {
// pull sparse here
if (task->state_ == PULL_SPARSE) {
VLOG(3) << "pull sparse taskid = " << task->taskid_;
for (int i = 0;
i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).pull_sparse_table_id(i));
// Look up the table parameter whose id matches tid (elements are copied
// by value while searching).
TableParameter table;
for (auto j : param_.sparse_table()) {
if (j.table_id() == tid) {
table = j;
break;
}
}
fleet_ptr_->HeterPullSparseVars(
thread_id_, task, tid, sparse_key_names_[tid], table.fea_dim(),
sparse_value_names_[tid]);
}
task->Update();
// JumpContext(task);
// break;
} else if (task->state_ == OP_RUN) {
VLOG(3) << "oprun taskid = " << task->taskid_;
for (int i = 0;
i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).pull_sparse_table_id(i));
CollectLabelInfo(task, i);
FillSparseValue(task, i);
// Adjust instance weights only when this table's value names contain the
// configured nid slot.
auto nid_iter = std::find(sparse_value_names_[tid].begin(),
sparse_value_names_[tid].end(),
adjust_ins_weight_config_.nid_slot());
if (nid_iter != sparse_value_names_[tid].end()) {
AdjustInsWeight(task);
}
}
VLOG(3) << "fill sparse value for all sparse table done.";
// do computation here
// Run the ops located before xpu_begin_op_index_ on this worker; ops whose
// type contains any entry of skip_ops_ are skipped.
for (int i = 0; i < xpu_begin_op_index_; ++i) {
auto& op = ops_[i];
bool need_skip = false;
for (auto t = 0u; t < skip_ops_.size(); ++t) {
if (op->Type().find(skip_ops_[t]) != std::string::npos) {
need_skip = true;
break;
}
}
if (!need_skip) {
VLOG(3) << "run op: " << op->Type();
op->Run(*(task->scope_), place_);
}
}
task->Update();
} else if (task->state_ == XPU) {
VLOG(3) << "call remote xpu taskid = " << task->taskid_;
std::vector<std::string> send_var_list;
for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) {
send_var_list.push_back(trainer_desc_.xpu_recv_list(i));
}
heter_ptr_->CallRemoteXpu(task, this, mpi_rank_, send_var_list);
task->Update();
JumpContext(task);
// Leave the state machine for this task.
// NOTE(review): presumably the task comes back through run_queue_ once the
// remote call has completed — confirm against JumpContext/CallRemoteXpu.
break;
} else if (task->state_ == OP_RUN_END) {
// Run the ops located after xpu_end_op_index_, with the same skip rule.
for (size_t i = xpu_end_op_index_ + 1; i < ops_.size(); ++i) {
auto& op = ops_[i];
bool need_skip = false;
for (auto t = 0u; t < skip_ops_.size(); ++t) {
if (op->Type().find(skip_ops_[t]) != std::string::npos) {
need_skip = true;
break;
}
}
if (!need_skip) {
op->Run(*(task->scope_), place_);
}
}
// check inf and nan
// NOTE(review): this loop only looks the variables up; no inf/nan check is
// actually performed on the tensors.
for (std::string& var_name : check_nan_var_names_) {
Variable* var = (task->scope_)->FindVar(var_name);
if (var == nullptr) {
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (tensor == nullptr) {
continue;
}
}
task->Update();
} else if (task->state_ == PUSH_GRAD) {
VLOG(3) << "push grad taskid = " << task->taskid_;
if (need_to_push_sparse_) {
// push gradients here
for (int i = 0;
i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
uint64_t tid = static_cast<uint64_t>(
param_.program_config(0).push_sparse_table_id(i));
TableParameter table;
// NOTE(review): the range variable `i` below shadows the outer loop index
// and copies each TableParameter by value.
for (auto i : param_.sparse_table()) {
if (i.table_id() == tid) {
table = i;
break;
}
}
fleet_ptr_->HeterPushSparseVars(
task, tid, sparse_key_names_[tid], sparse_grad_names_[tid],
table.emb_dim(), &push_sparse_status_, use_cvm_, dump_slot_,
no_cvm_);
}
}
if (need_to_push_sparse_) {
VLOG(3) << "push sparse gradient done.";
// tmp_push_sparse_wait_times is fixed at -1, so push_sparse_wait_times is
// the largest uint32_t value and the pending statuses are dropped by the
// second `if` below on every pass rather than waited on.
int32_t tmp_push_sparse_wait_times = -1;
static uint32_t push_sparse_wait_times =
static_cast<uint32_t>(tmp_push_sparse_wait_times);
if (push_sparse_status_.size() >= push_sparse_wait_times) {
for (auto& t : push_sparse_status_) {
t.wait();
}
push_sparse_status_.resize(0);
}
if (tmp_push_sparse_wait_times == -1) {
push_sparse_status_.resize(0);
}
}
// NOTE: field dumping (writing instance ids, instance content and the
// configured dump_fields_ tensors through writer_, plus DumpParam on
// thread 0) is disabled in this worker.
// thread_scope_->DropKids();
task->Update();
} else if (task->state_ == DONE) {
VLOG(3) << "done taskid = " << task->taskid_;
// Return the finished task object to the pool for reuse.
object_pool_.Push(task);
PrintFetchVars();
++done_cnt;
// ++batch_cnt;
break;
}
}
}
// Nothing is flushed here: the writer flush is disabled.
if (need_dump_field_) {
// writer_.Flush();
}
// Training loop finished: copy tables and dense vars once if configured.
if (copy_table_config_.need_copy()) {
CopySparseTable();
CopyDenseTable();
CopyDenseVars();
}
}
} // end namespace framework
} // end namespace paddle
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include "io/fs.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
namespace paddle {
namespace framework {
// Configures the trainer from `trainer_desc`:
//   * records the gradient variable names of every dense table,
//   * creates one CUDA place, copy stream and (timing-disabled) event per
//     entry of worker_places,
//   * records the stat variable names to merge in EndPass,
//   * initializes the pull-dense worker, fetches the fleet/heter wrapper
//     singletons and registers the service handlers.
// `dataset` is not used by this function.
void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
                                 Dataset* dataset) {
  srand(static_cast<unsigned>(time(NULL)));
  param_ = trainer_desc.downpour_param();

  // Dense gradient names, keyed by table id.
  for (const auto& table : param_.dense_table()) {
    const uint64_t table_id = static_cast<uint64_t>(table.table_id());
    auto& grad_names = dense_grad_names_[table_id];
    grad_names.resize(table.dense_grad_name_size());
    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
      grad_names[j] = table.dense_grad_name(j);
    }
  }
  scale_datanorm_ = trainer_desc.scale_datanorm();

  // One place / copy stream / event per configured device id.
  for (int device_num : trainer_desc.worker_places()) {
    platform::CUDAPlace place = platform::CUDAPlace(device_num);
    platform::CUDADeviceGuard guard(place.device);

    cudaStream_t stream;
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
    copy_streams_.push_back(stream);
    places_.push_back(place);

    cudaEvent_t event;
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
    events_.push_back(event);
  }

  for (const auto& stat_name :
       trainer_desc.downpour_param().stat_var_names()) {
    need_merge_var_names_.push_back(stat_name);
  }
  running_ = true;

  VLOG(3) << "going to initialize pull dense worker";
  pull_dense_worker_ = PullDenseWorker::GetInstance();
  pull_dense_worker_->Initialize(trainer_desc);
  VLOG(3) << "initialize pull dense worker";
  SetDebug(trainer_desc.debug());

  fleet_ptr_ = FleetWrapper::GetInstance();
  heter_ptr_ = HeterWrapper::GetInstance();
  RegisterServiceHandler();
  trainer_desc_ = trainer_desc;
}
void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
auto place = places_[num];
Scope* scope = place_scopes_[num];
auto stream = copy_streams_[num];
auto event = events_[num];
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
auto& block = program.Block(0);
for (auto& var : block.AllVars()) {
if (var->Persistable()) {
auto name = var->Name();
Variable* root_var = root_scope_->FindVar(name);
LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
auto* ptr = scope->Var(name);
InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
LoDTensor* thread_tensor = ptr->GetMutable<LoDTensor>();
#define HeterMemcpyFunc(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
} \
} while (0)
_ForEachDataType_(HeterMemcpyFunc);
}
}
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
cudaEventSynchronize(event);
}
// Allocates `thread_tensor` on `thread_place` with the dimensions of
// `root_tensor` and queues a copy of the root data into it on `stream`.
// The source is read from the CPU when the root tensor lives on a CPU place,
// otherwise from the root tensor's CUDA place. `thread_place` must hold a
// CUDAPlace.
template <typename T>
void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                                  LoDTensor* root_tensor,
                                  const paddle::platform::Place& thread_place,
                                  cudaStream_t stream) {
  T* dst = thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
  T* src = root_tensor->data<T>();
  const bool root_on_cpu = platform::is_cpu_place(root_tensor->place());
  if (!root_on_cpu) {
    // Device-to-device copy.
    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), dst,
                 BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()),
                 src, sizeof(T) * root_tensor->numel(), stream);
    return;
  }
  // Host-to-device copy.
  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), dst,
               platform::CPUPlace(), src, sizeof(T) * root_tensor->numel(),
               stream);
}
// No-op: the body is empty, so nothing is dumped for thread `tid`.
void HeterXpuTrainer::DumpWork(int tid) {}
// Caches `main_program`, remembers `place`, and registers the cost-profiler
// entries that RunTask reports its timings under.
void HeterXpuTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                     const platform::Place& place) {
  CacheProgram(main_program);
  place_ = place;

  static const char* const kProfilerNames[] = {
      "xpu_service_run_task", "xpu_service_deserial",
      "xpu_service_launch_kernel", "xpu_service_wait"};
  auto& profiler = paddle::ps::CostProfiler::instance();
  for (const char* profiler_name : kProfilerNames) {
    profiler.register_profiler(profiler_name);
  }
}
void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
auto& block = main_program.Block(0);
pull_dense_worker_->SetRootScope(root_scope_);
pull_dense_worker_->CreatePinVar();
for (size_t i = 0; i < places_.size(); ++i) {
Scope* scope = &(root_scope_->NewScope());
// for (auto &var : block.AllVars()) {
// if (var->Persistable()) {
// auto *ptr = scope->Var(var->Name());
// InitializeVariable(ptr, var->GetType());
// }
// }
place_scopes_.push_back(scope);
CreateThreadParam(main_program, i);
pull_dense_worker_->AddThreadScope(scope);
pull_dense_worker_->AddPlace(places_[i]);
pull_dense_worker_->AddStream(copy_streams_[i]);
}
pull_dense_worker_->Start();
for (auto& stream : copy_streams_) {
cudaStreamSynchronize(stream);
}
op_names_.clear();
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
op_names_.push_back(op_desc->Type());
OperatorBase* local_op_ptr = local_op.release();
ops_.push_back(local_op_ptr);
continue;
}
xpu_begin_op_index_ = xpu_end_op_index_ = -1;
xpu_begin_op_index_ = trainer_desc_.xpu_start_idx();
xpu_end_op_index_ = trainer_desc_.xpu_end_idx();
VLOG(0) << "xpu begin: " << xpu_begin_op_index_
<< " xpu end: " << xpu_end_op_index_;
// CHECK(xpu_begin_op_index_ == 0);
// CHECK(xpu_end_op_index_ = ops_.size() - 1);
//// init pool
for (size_t i = 0; i < 6; ++i) {
for (size_t j = 0; j < places_.size(); ++j) {
int num = j;
std::shared_ptr<HeterServiceContext> context =
std::make_shared<HeterServiceContext>();
context->place_num_ = num;
auto place = places_[num];
context->scope_ = &(place_scopes_[num]->NewScope());
auto& block = program_.Block(0);
for (auto& var : block.AllVars()) {
if (!var->Persistable()) {
auto* ptr = context->scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
}
}
for (auto& v : dense_grad_names_) {
for (auto& name : v.second) {
auto* ptr = context->scope_->Var(name + "pin");
InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
}
}
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
OperatorBase* local_op_ptr = local_op.release();
(context->ops_).push_back(local_op_ptr);
}
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
object_pool_.Push(context);
}
}
VLOG(3) << "init other env done.";
}
// No-op: the body is empty.
// NOTE(review): the work of this trainer appears to be driven through the
// registered service handlers (e.g. RunTask / EndPass / StopService) rather
// than through Run() — confirm against RegisterServiceHandler.
void HeterXpuTrainer::Run() {}
int HeterXpuTrainer::EndPass(const HeterRequest* request,
HeterResponse* response) {
// int scope_num = object_pool_.Size();
for (size_t i = 0; i < need_merge_var_names_.size(); i++) {
Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]);
if (root_var == nullptr) {
continue;
}
LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
for (size_t j = 0; j < place_scopes_.size(); j++) {
Scope* cur_thread_scope = place_scopes_[j];
Variable* thread_var =
cur_thread_scope->FindVar(need_merge_var_names_[i]);
if (thread_var == nullptr) {
continue;
}
LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
// if (root_tensor->numel() != thread_tensor->numel()) {
// continue;
// }
#define MergeCallback(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
if (thread_tensor->type() != proto_type) { \
VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \
<< "] " << need_merge_var_names_[i] \
<< ", root tensor type=" << root_tensor->type() \
<< ", thread tensor type=" << thread_tensor->type(); \
exit(-1); \
} \
MergeToRootScope<cpp_type>(root_tensor, thread_tensor); \
} \
} while (0)
_ForEachDataType_(MergeCallback);
if (platform::is_gpu_place(thread_tensor->place())) {
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
} else {
memset(thread_tensor->data<void>(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
}
}
auto* merge_var = response->add_vars();
heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
merge_var);
if (platform::is_gpu_place(root_tensor->place())) {
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
} else {
memset(root_tensor->data<void>(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
}
}
return 0;
}
// Adds the elements of `tensor` onto `root_tensor`, element by element.
// Both tensors are first copied to the CPU, summed there, and the result is
// copied back to the root tensor's original place.
//
// When the two tensors hold a different number of elements, only the common
// prefix is merged and a warning is logged; the loop never runs past the end
// of either buffer.
template <typename T>
void HeterXpuTrainer::MergeToRootScope(LoDTensor* root_tensor,
                                       LoDTensor* tensor) {
  LoDTensor tmp_root;
  TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root);
  T* tmp_root_data = tmp_root.data<T>();
  LoDTensor tmp_tensor;
  TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor);
  T* data = tmp_tensor.data<T>();

  const int64_t root_numel = tmp_root.numel();
  const int64_t thread_numel = tmp_tensor.numel();
  if (root_numel != thread_numel) {
    LOG(WARNING) << "merge tensor numel not equal, root " << root_numel
                 << " vs thread " << thread_numel
                 << ", only the common prefix is merged";
  }
  // Bound the loop by the smaller tensor so a larger thread tensor cannot
  // write past the end of the root buffer; 64-bit index to match numel().
  const int64_t merge_numel =
      thread_numel < root_numel ? thread_numel : root_numel;
  for (int64_t i = 0; i < merge_numel; ++i) {
    tmp_root_data[i] += data[i];
  }
  TensorCopy(tmp_root, root_tensor->place(), root_tensor);
}
// Service handler that clears the running_ flag under mutex_ and wakes one
// waiter on cond_. Always returns 0; `request` and `response` are not used.
int HeterXpuTrainer::StopService(const HeterRequest* request,
                                 HeterResponse* response) {
  std::lock_guard<std::mutex> guard(mutex_);
  running_ = false;
  cond_.notify_one();
  return 0;
}
int HeterXpuTrainer::RunTask(const HeterRequest* request,
HeterResponse* response) {
auto timer = std::make_shared<paddle::ps::CostTimer>("xpu_service_run_task");
std::shared_ptr<HeterServiceContext> context = object_pool_.Get();
if (!context->scope_) {
int num = rand() % places_.size();
context->place_num_ = num;
auto place = places_[num];
context->scope_ = &(place_scopes_[num]->NewScope());
auto& block = program_.Block(0);
for (auto& var : block.AllVars()) {
if (!var->Persistable()) {
auto* ptr = context->scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
}
}
for (auto& v : dense_grad_names_) {
for (auto& name : v.second) {
auto* ptr = context->scope_->Var(name + "pin");
InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
}
}
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
OperatorBase* local_op_ptr = local_op.release();
(context->ops_).push_back(local_op_ptr);
}
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
}
context->Reset();
auto place = places_[context->place_num_];
{
auto deserial_timer =
std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
for (int i = 0; i < request->vars_size(); ++i) {
heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
copy_streams_[context->place_num_]);
}
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
while (cudaEventQuery(context->event_) != cudaSuccess) {
VLOG(3) << "wait for kernel";
bthread_yield();
}
}
{
auto launch_timer =
std::make_shared<paddle::ps::CostTimer>("xpu_service_launch_kernel");
for (int i = xpu_begin_op_index_; i <= xpu_end_op_index_; ++i) {
auto& op = (context->ops_)[i];
op->Run(*(context->scope_), place);
}
}
auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(context->event_, dev_ctx->stream()));
// cudaEventSynchronize(context->event_);
{
auto wait_timer =
std::make_shared<paddle::ps::CostTimer>("xpu_service_wait");
while (cudaEventQuery(context->event_) != cudaSuccess) {
VLOG(3) << "wait for kernel";
bthread_yield();
}
}
for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
const std::string& varname = trainer_desc_.xpu_send_list(i);
// CHECK(varname == "concat_1.tmp_0@GRAD");
auto* res_var = response->add_vars();
heter_ptr_->SerializeToReq(varname, context->scope_, res_var);
}
// std::string varname = "concat_1.tmp_0@GRAD";
//
// auto* res_var = response->add_vars();
// heter_ptr_->SerializeToReq(varname, context->scope_, res_var);
for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
++i) {
uint64_t tid =
static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
fleet_ptr_->PushDenseVarsAsync(
*(context->scope_), tid, dense_grad_names_[tid],
&(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
places_[context->place_num_], copy_streams_[context->place_num_],
context->event_);
}
for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
++i) {
uint64_t tid =
static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
pull_dense_worker_->IncreaseThreadVersion(0, tid);
}
VLOG(3) << "push dense gradient done.";
context->scope_->DropKids();
object_pool_.Push(context);
VLOG(0) << "pool size " << object_pool_.Size();
return 0;
}
// Registers this trainer's three service entry points on heter_ptr_ under the
// numeric keys 0 (RunTask), 1 (EndPass) and 2 (StopService).
void HeterXpuTrainer::RegisterServiceHandler() {
  auto run_task = [this](const HeterRequest* request,
                         HeterResponse* response) -> int {
    return this->RunTask(request, response);
  };
  auto end_pass = [this](const HeterRequest* request,
                         HeterResponse* response) -> int {
    return this->EndPass(request, response);
  };
  auto stop_service = [this](const HeterRequest* request,
                             HeterResponse* response) -> int {
    return this->StopService(request, response);
  };
  heter_ptr_->RegisterServiceHandler(0, run_task);
  heter_ptr_->RegisterServiceHandler(1, end_pass);
  heter_ptr_->RegisterServiceHandler(2, stop_service);
}
// Always returns nullptr; thread_id is ignored.
Scope* HeterXpuTrainer::GetWorkerScope(int thread_id) { return nullptr; }
// Blocks until running_ becomes false (StopService clears it and signals
// cond_), then sleeps for three seconds, stops the pull-dense worker and drops
// the child scopes of the root scope. mutex_ is held until the function
// returns.
void HeterXpuTrainer::Finalize() {
  std::unique_lock<std::mutex> lock(mutex_);
  while (running_) {
    cond_.wait(lock);
  }
  sleep(3);
  pull_dense_worker_->Stop();
  root_scope_->DropKids();
}
} // namespace framework
} // namespace paddle
#endif
......@@ -102,6 +102,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
workers_[i]->SetRootScope(root_scope_);
workers_[i]->CreateDeviceResource(main_program); // Program
workers_[i]->BindingDataFeedMemory();
workers_[i]->CacheProgram(main_program);
}
}
......
......@@ -56,6 +56,34 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
current_version_[tid] = 0;
}
fleet_ptr_ = FleetWrapper::GetInstance();
#ifdef PADDLE_WITH_CUDA
copy_streams_.clear();
places_.clear();
thread_scopes_.clear();
#endif
}
// Creates, for every dense parameter of every pull-dense table listed in
// program_config(0), a companion variable named "<name>pin" in the root scope
// and allocates it as a float LoDTensor in CUDA pinned host memory with the
// same dims as the parameter. Does nothing unless built with
// PADDLE_WITH_CUDA.
void PullDenseWorker::CreatePinVar() {
#ifdef PADDLE_WITH_CUDA
  for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
       ++i) {
    uint64_t tid = static_cast<uint64_t>(
        dwp_param_.program_config(0).pull_dense_table_id(i));
    for (size_t j = 0; j < dense_value_names_[tid].size(); j++) {
      auto& name = dense_value_names_[tid][j];
      Variable* var = root_scope_->FindVar(name);
      // Fail with a clear message instead of dereferencing a null pointer
      // when the parameter has not been created in the root scope.
      PADDLE_ENFORCE_NOT_NULL(
          var, platform::errors::NotFound(
                   "Dense variable %s is not found in the root scope.", name));
      LoDTensor* tensor = var->GetMutable<LoDTensor>();
      auto* ptr = root_scope_->Var(name + "pin");
      InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
      LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
      pin_tensor->mutable_data<float>(tensor->dims(),
                                      platform::CUDAPinnedPlace());
    }
  }
#endif
}
void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
......@@ -75,6 +103,31 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
exit(-1);
}
status_vec->resize(0);
#ifdef PADDLE_WITH_CUDA
// For every registered place, copy each dense parameter from its pinned host
// tensor ("<name>pin" in the root scope) into the same-named tensor of that
// place's thread scope, issuing the copy on that place's copy stream.
// NOTE(review): both FindVar results are dereferenced unchecked, and no stream
// synchronization is visible in this span -- confirm callers cover both.
for (size_t i = 0; i < places_.size(); ++i) {
// Walk the pull-dense table ids of program_config(0) and, per table, the
// dense value names registered for it.
for (int x = 0; x < dwp_param_.program_config(0).pull_dense_table_id_size();
++x) {
uint64_t tid = static_cast<uint64_t>(
dwp_param_.program_config(0).pull_dense_table_id(x));
for (size_t j = 0; j < dense_value_names_[tid].size(); j++) {
auto& name = dense_value_names_[tid][j];
// Source: pinned host copy of the parameter.
Variable* pin_var = root_scope_->FindVar(name + "pin");
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_w = pin_tensor->data<float>();
// Destination: the parameter in the scope that belongs to place i.
Variable* var = thread_scopes_[i]->FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
// The byte count is taken from the destination tensor.
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
platform::CUDAPinnedPlace(), pin_w,
sizeof(float) * tensor->numel(), copy_streams_[i]);
}
}
}
#endif
}
void PullDenseWorker::Stop() {
......@@ -91,8 +144,14 @@ void PullDenseWorker::PullDense(bool force_update) {
uint64_t tid = static_cast<uint64_t>(
dwp_param_.program_config(0).pull_dense_table_id(i));
if (force_update || CheckUpdateParam(tid)) {
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "pull dense " << force_update << " " << tid;
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_);
&pull_dense_status_, false);
#else
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, true);
#endif
ResetThreadVersion(tid);
}
}
......
......@@ -21,9 +21,12 @@ limitations under the License. */
#include <thread> // NOLINT
#include <vector>
#include <ctime>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/reader.h"
......@@ -62,6 +65,7 @@ class TrainerBase {
Scope* root_scope_;
bool debug_;
Dataset* dataset_ptr_;
TrainerDesc trainer_desc_;
// For dump param or field
bool need_dump_field_ = false;
......@@ -118,10 +122,86 @@ class DistMultiTrainer : public MultiTrainer {
void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
virtual void InitDumpEnv();
virtual Scope* GetWorkerScope(int thread_id);
virtual void RegisterHeterCallback();
protected:
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
// Per-request working state used by HeterXpuTrainer: the scope the request
// runs in, the operators to run, a CUDA event used for waiting, and the
// futures of pending dense pushes. The context owns the operators in ops_ and
// deletes them on destruction.
class HeterServiceContext {
 public:
  HeterServiceContext() {}
  virtual ~HeterServiceContext() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
    // NOTE(review): event_ is not destroyed here; confirm whether the CUDA
    // event created by the trainer is released elsewhere.
  }

  // Forgets the futures of the previous request's dense pushes.
  void Reset() { push_dense_status_.clear(); }

  // Index of the place (and copy stream) this context is bound to.
  // Value-initialized so it is never read as an indeterminate value.
  int place_num_{0};
  // Scope the request is executed in; nullptr until the first use.
  Scope* scope_{nullptr};
  // CUDA event used to wait for stream work; null until it is created.
  cudaEvent_t event_{nullptr};
  // Operators owned by this context.
  std::vector<OperatorBase*> ops_;
  // Futures returned by asynchronous dense pushes.
  std::vector<::std::future<int32_t>> push_dense_status_;
};
// Trainer that serves HeterRequest tasks on CUDA devices: it registers
// handlers for running a task, ending a pass and stopping the service, and
// keeps the cached program, the per-place scopes/streams and a pool of
// HeterServiceContext objects.
class HeterXpuTrainer : public TrainerBase {
 public:
  HeterXpuTrainer() {}
  // Deletes the operators owned through ops_.
  virtual ~HeterXpuTrainer() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void DumpWork(int tid);
  virtual void RegisterServiceHandler();
  virtual int RunTask(const HeterRequest* request, HeterResponse* response);
  virtual Scope* GetWorkerScope(int thread_id);
  // Replaces program_ with a copy of main_program.
  // The copy is built in place with placement new (presumably because
  // ProgramDesc offers no copy assignment -- TODO confirm). The current value
  // is destroyed first; constructing over a live object would leak whatever
  // it owns.
  // NOTE(review): if the copy constructor throws, program_ is left destroyed.
  virtual void CacheProgram(const ProgramDesc& main_program) {
    program_.~ProgramDesc();
    new (&program_) ProgramDesc(main_program);
  }
  template <typename T>
  void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place,
                   cudaStream_t stream);
  void CreateThreadParam(const ProgramDesc& program, int num);
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
  int EndPass(const HeterRequest* request, HeterResponse* response);
  int StopService(const HeterRequest* request, HeterResponse* response);

 protected:
  DownpourWorkerParameter param_;
  // Dense gradient variable names, keyed by table id.
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
  float scale_datanorm_;
  // Inclusive index range of the operators executed for each task.
  int xpu_begin_op_index_;
  int xpu_end_op_index_;
  // Cleared by StopService; Finalize waits on cond_ until it is false.
  bool running_;
  paddle::platform::Place place_;
  std::mutex mutex_;
  // Program installed by CacheProgram.
  ProgramDesc program_;
  std::condition_variable cond_;
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<OperatorBase*> ops_;
  std::vector<std::string> op_names_;
  // Per-place scopes, copy streams and places; they share the same index.
  std::vector<Scope*> place_scopes_;
  BtObjectPool<HeterServiceContext> object_pool_;
  std::vector<cudaStream_t> copy_streams_;
  std::vector<platform::Place> places_;
  std::vector<cudaEvent_t> events_;
};
#endif
#if defined(PADDLE_WITH_NCCL)
class PipelineTrainer : public TrainerBase {
......
......@@ -52,6 +52,12 @@ message TrainerDesc {
optional bool enable_random_dump = 24 [ default = false ];
optional bool random_with_lineid = 25 [ default = false ];
optional int32 dump_interval = 26 [ default = 10000 ];
repeated int32 worker_places = 27;
repeated string xpu_send_list = 28;
repeated string xpu_recv_list = 29;
optional int32 xpu_start_idx = 30;
optional int32 xpu_end_idx = 31;
// device worker parameters
optional HogwildWorkerParameter hogwild_param = 101;
......
......@@ -63,6 +63,9 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
// Trainer classes registered with the trainer factory.
REGISTER_TRAINER_CLASS(MultiTrainer);
REGISTER_TRAINER_CLASS(DistMultiTrainer);
// HeterXpuTrainer is only registered in builds with both CUDA and PSLIB.
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(HeterXpuTrainer);
#endif
// PipelineTrainer is only registered in builds with NCCL.
#if defined(PADDLE_WITH_NCCL)
REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif
......
......@@ -241,6 +241,156 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
}
};
// Operator that merges the input dimensions start_axis..stop_axis (both
// inclusive) into a single dimension and keeps the other dimensions.
class FlattenContiguousRangeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates the axes and sets the shapes of "Out" and "XShape".
  // Negative axes count from the last dimension. "XShape" gets the input
  // dims prefixed with a 0 and shares the LoD of X; "Out" shares the LoD only
  // when its first dimension equals the input's.
  void InferShape(framework::InferShapeContext *ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange");
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
                   "FlattenContiguousRange");
    const auto &start_axis = ctx->Attrs().Get<int>("start_axis");
    const auto &stop_axis = ctx->Attrs().Get<int>("stop_axis");
    const auto &in_dims = ctx->GetInputDim("X");
    int in_dims_size = in_dims.size();
    int real_start_axis = start_axis, real_stop_axis = stop_axis;
    if (start_axis < 0) {
      real_start_axis = start_axis + in_dims_size;
    }
    if (stop_axis < 0) {
      real_stop_axis = stop_axis + in_dims_size;
    }
    // Reject axes outside [0, rank) before they are used as indices.
    PADDLE_ENFORCE_GE(
        real_start_axis, 0,
        platform::errors::InvalidArgument(
            "The start_axis is out of range: after normalization it is %d, "
            "but the rank of Input(X) is %d.",
            real_start_axis, in_dims_size));
    PADDLE_ENFORCE_LT(
        real_stop_axis, in_dims_size,
        platform::errors::InvalidArgument(
            "The stop_axis is out of range: after normalization it is %d, "
            "but the rank of Input(X) is %d.",
            real_stop_axis, in_dims_size));
    PADDLE_ENFORCE_GE(
        real_stop_axis, real_start_axis,
        platform::errors::InvalidArgument("The stop_axis should be greater "
                                          "than or equal to start_axis."));

    const auto &out_dims =
        GetOutputShape(real_start_axis, real_stop_axis, in_dims);
    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
    if (in_dims[0] == out_dims[0]) {
      // Only pass LoD when the first dimension of output and Input(X)
      // are the same.
      ctx->ShareLoD("X", "Out");
    }

    OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape",
                   "FlattenContiguousRange");
    std::vector<int64_t> xshape_dims(in_dims.size() + 1);
    xshape_dims[0] = 0;
    for (int i = 0; i < in_dims.size(); ++i) {
      xshape_dims[i + 1] = in_dims[i];
    }
    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
    ctx->ShareLoD("X", "XShape");
  }

  // Returns the output shape for already-normalized axes: the dims before
  // start_axis, the product of dims start_axis..stop_axis, then the dims
  // after stop_axis. If any merged dim is unknown (-1), the merged dim is -1
  // rather than a meaningless product.
  // NOTE(review): the result is narrowed to int32_t, as in the original.
  static std::vector<int32_t> GetOutputShape(const int start_axis,
                                             const int stop_axis,
                                             const framework::DDim &in_dims) {
    int64_t outer = 1;
    std::vector<int32_t> out_shape;
    int in_dims_size = in_dims.size();
    out_shape.reserve(in_dims_size - stop_axis + start_axis);

    for (int i = 0; i < start_axis; ++i) {
      out_shape.push_back(in_dims[i]);
    }
    for (int i = start_axis; i <= stop_axis; i++) {
      if (in_dims[i] == -1 || outer == -1) {
        outer = -1;
      } else {
        outer *= in_dims[i];
      }
    }
    out_shape.push_back(outer);
    for (int i = stop_axis + 1; i < in_dims_size; i++) {
      out_shape.push_back(in_dims[i]);
    }
    return out_shape;
  }
};
// Declares the inputs, outputs, attributes and documentation of the
// flatten_contiguous_range operator. The descriptions state the inclusive
// [start_axis, stop_axis] range that the documented examples show.
class FlattenContiguousRangeOpMaker : public FlattenOpMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor) The input tensor whose dimensions from start_axis to "
             "stop_axis are flattened.");
    AddOutput("Out",
              "(Tensor) The reshaped input tensor: the input dimensions from "
              "start_axis to stop_axis (both inclusive) are flattened into "
              "one dimension and the remaining dimensions are kept "
              "unchanged.");
    AddAttr<int>("start_axis",
                 "(int) "
                 "Indicate the input start dimension (inclusive) to flatten")
        .SetDefault(1);
    AddAttr<int>("stop_axis",
                 "(int) "
                 "Indicate the input stop dimension (inclusive) to flatten")
        .SetDefault(1);
    AddComment(R"DOC(
Flatten Operator
Flattens the input tensor into a new matrix according to start_axis and stop_axis.
Examples:
Case 1:
Given
X.shape = (3, 100, 100, 4)
and
start_axis = 2, stop_axis = -1
We get:
Out.shape = (3, 100, 400)
Case 2:
Given
X.shape = (3, 100, 100, 4)
and
start_axis = 0, stop_axis = -1
We get:
Out.shape = (3 * 100 * 100 * 4)
)DOC");
    AddOutput("XShape",
              "XShape is just used to store the shape and lod of X, which will "
              "be used in FlattenGradOp.")
        .AsIntermediate();
  }
};
// Builds the flatten_contiguous_range_grad op description: it consumes the
// forward op's XShape output and the gradient of Out, produces the gradient of
// X, and inherits the forward op's attributes.
template <typename T>
class FlattenContiguousRangeGradOpMaker
    : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

  void Apply(GradOpPtr<T> grad_op) const override {
    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");

    grad_op->SetType("flatten_contiguous_range_grad");
    grad_op->SetInput("XShape", this->Output("XShape"));
    grad_op->SetInput(out_grad_name, this->OutputGrad("Out"));
    grad_op->SetOutput(x_grad_name, this->InputGrad("X"));
    grad_op->SetAttrMap(this->Attrs());
  }
};
// Gradient operator of flatten_contiguous_range.
class FlattenContiguousRangeGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Gives the gradient of X the original input shape, recovered from XShape by
  // dropping its leading entry, and shares the LoD of XShape with it.
  void InferShape(framework::InferShapeContext *context) const override {
    OP_INOUT_CHECK(context->HasInput("XShape"), "Input", "XShape",
                   "FlattenContiguousRangeGrad");
    OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input",
                   framework::GradVarName("Out"), "FlattenContiguousRangeGrad");

    const auto x_grad_name = framework::GradVarName("X");
    const auto recorded_dims = context->GetInputDim("XShape");
    const auto original_dims =
        framework::slice_ddim(recorded_dims, 1, recorded_dims.size());
    context->SetOutputDim(x_grad_name, original_dims);
    context->ShareLoD("XShape", x_grad_name);
  }

 protected:
  // The kernel data type follows the gradient of Out.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    const auto data_type = OperatorWithKernel::IndicateVarDataType(
        ctx, framework::GradVarName("Out"));
    return framework::OpKernelType(data_type, ctx.device_context());
  }
};
DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInferer, {"X", "Out"});
DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceInferer,
{framework::GradVarName("Out"),
......@@ -266,6 +416,16 @@ REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
ops::FlattenGradInplaceInferer);
// Forward op: registered with its proto maker, the grad-op makers for both the
// OpDesc and the imperative OpBase representations, and the X -> Out in-place
// inferer.
REGISTER_OPERATOR(
flatten_contiguous_range, ops::FlattenContiguousRangeOp,
ops::FlattenContiguousRangeOpMaker,
ops::FlattenContiguousRangeGradOpMaker<paddle::framework::OpDesc>,
ops::FlattenContiguousRangeGradOpMaker<paddle::imperative::OpBase>,
ops::FlattenOpInplaceInferer);
// Backward op: registered with the gradient in-place inferer.
REGISTER_OPERATOR(flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradOp,
ops::FlattenGradInplaceInferer);
REGISTER_OP_CPU_KERNEL(
flatten, ops::FlattenKernel<paddle::platform::CPUDeviceContext, float>,
ops::FlattenKernel<paddle::platform::CPUDeviceContext, double>,
......@@ -292,3 +452,26 @@ REGISTER_OP_CPU_KERNEL(
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
// CPU kernels of the forward op for float, double, int, int8_t and int64_t.
REGISTER_OP_CPU_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
double>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, int>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
int64_t>);
// CPU kernels of the backward op for the same element types.
REGISTER_OP_CPU_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
double>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int64_t>);
......@@ -42,3 +42,26 @@ REGISTER_OP_CUDA_KERNEL(
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
// CUDA kernels of the forward op for float, double, int, int8_t and int64_t.
REGISTER_OP_CUDA_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, int>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
int8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
int64_t>);
// CUDA kernels of the backward op for the same element types.
REGISTER_OP_CUDA_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
......@@ -112,5 +112,73 @@ class Flatten2GradKernel : public framework::OpKernel<T> {
}
};
// Forward kernel of flatten_contiguous_range: copies X into Out and gives Out
// the flattened shape.
template <typename DeviceContext, typename T>
class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto &start_axis = context.Attr<int>("start_axis");
    auto &stop_axis = context.Attr<int>("stop_axis");
    auto *in = context.Input<framework::LoDTensor>("X");
    auto x_dims = in->dims();
    const int rank = x_dims.size();

    // Negative axes count from the last dimension.
    const int first_axis = start_axis < 0 ? start_axis + rank : start_axis;
    const int last_axis = stop_axis < 0 ? stop_axis + rank : stop_axis;

    auto *out = context.Output<framework::LoDTensor>("Out");
    auto flattened_dims =
        framework::make_ddim(GetOutputShape(first_axis, last_axis, x_dims));

    out->mutable_data(context.GetPlace(), in->type());
    framework::TensorCopy(
        *in, context.GetPlace(),
        context.template device_context<platform::DeviceContext>(), out);
    // Apply the flattened shape after the copy.
    out->Resize(flattened_dims);
  }

  // Shape of the output for normalized axes: dims before start_axis, the
  // product of dims start_axis..stop_axis, then dims after stop_axis.
  static std::vector<int32_t> GetOutputShape(const int start_axis,
                                             const int stop_axis,
                                             const framework::DDim &in_dims) {
    const int rank = in_dims.size();
    std::vector<int32_t> shape;
    shape.reserve(rank - stop_axis + start_axis);

    for (int axis = 0; axis < start_axis; ++axis) {
      shape.push_back(in_dims[axis]);
    }

    int64_t merged = 1;
    for (int axis = start_axis; axis <= stop_axis; ++axis) {
      merged *= in_dims[axis];
    }
    shape.push_back(merged);

    for (int axis = stop_axis + 1; axis < rank; ++axis) {
      shape.push_back(in_dims[axis]);
    }
    return shape;
  }
};
// Backward kernel of flatten_contiguous_range: copies the gradient of Out into
// the gradient of X and restores the original input shape recorded in XShape
// (its dims without the leading entry).
template <typename DeviceContext, typename T>
class FlattenContiguousRangeGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *grad_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    auto *grad_out =
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));

    const auto recorded_dims =
        ctx.Input<framework::LoDTensor>("XShape")->dims();
    const auto original_dims =
        framework::slice_ddim(recorded_dims, 1, recorded_dims.size());

    grad_x->mutable_data(ctx.GetPlace(), grad_out->type());
    framework::TensorCopySync(*grad_out, ctx.GetPlace(), grad_x);
    grad_x->Resize(original_dims);
  }
};
} // namespace operators
} // namespace paddle
......@@ -7,7 +7,12 @@ register_operators(EXCLUDES
fused_fc_elementwise_layernorm_op
multihead_matmul_op
fused_embedding_eltwise_layernorm_op
fusion_group_op)
fusion_group_op
fusion_gru_op)
# fusion_gru_op does not have CUDA kernel: it is built on its own here and
# declared to the pybind file as a CPU-only op.
op_library(fusion_gru_op)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")
if (WITH_GPU)
# fused_bn_activation_op needs cudnn 7.4.1 above
......
......@@ -19,6 +19,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -122,8 +125,17 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
// Chooses the kernel for fusion_gru. The data type follows input X and the
// place is the execution place. The plain library with any layout is used by
// default; in PADDLE_WITH_MKLDNN builds the MKLDNN library and layout are
// selected when platform::CanMKLDNNBeUsed(ctx) holds.
framework::OpKernelType FusionGRUOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library = framework::LibraryType::kPlain;
  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
  if (platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;
    layout = framework::DataLayout::kMKLDNN;
  }
#endif
  // Single return expression; the older (data type, device context) argument
  // line is not kept because it made the statement ill-formed.
  return framework::OpKernelType(
      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout,
      library);
}
void FusionGRUOpMaker::Make() {
......@@ -187,6 +199,9 @@ void FusionGRUOpMaker::Make() {
"bool"
"use origin mode in article https://arxiv.org/abs/1412.3555")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
The Fusion complete GRU Operator.
This operator fuse the fully-connected operator into GRU,
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T>
class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
public:
// Builds (or reuses from the cache) the oneDNN GRU forward-inference
// primitive descriptor.
//   ctx          - read for the "gate_activation" and "activation" attributes
//   dev_ctx      - supplies the engine and the blob cache
//   mkldnn_engine, weight_h - not used in this constructor
//   input        - not used in this constructor
//   h0           - when non-null, an initial hidden state descriptor is added
//   is_reverse   - selects right-to-left instead of left-to-right direction
//   N, Ti, IC, OC - batch size, max sentence length, input and output channels
//   unique_name  - base of the cache keys
// The primitive key combines unique_name with Ti; memory_key_ omits Ti.
GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
const platform::MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const LoDTensor* input,
const Tensor* weight_h, const Tensor* h0,
const bool is_reverse, const int64_t N, const int64_t Ti,
const int64_t IC, const int64_t OC,
const std::string& unique_name)
: platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(unique_name, Ti)),
N(N),
Ti(Ti),
IC(IC),
OC(OC) {
// Create memory key without Ti because weights, bias and h0 memories
// do not depend on Ti size but primitive and input/output memory do
// With the default session id the key is additionally made per-thread.
if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
memory_key_ = unique_name;
} else {
memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr();
}
if (!this->isCached()) {
// oneDNN kernel has hardcoded activation functions
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("gate_activation"), "sigmoid",
platform::errors::Unimplemented(
"oneDNN fusion_gru supports only sigmoid as a gate activation."));
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("activation"), "tanh",
platform::errors::Unimplemented(
"oneDNN fusion_gru supports only tanh as an activation."));
// oneDNN RNN dimensions
const int64_t D = 1; // Directions
const int64_t L = 1; // Layers (PP supports only 1 stacked layer)
const int64_t G = 3; // Number of Gates, 3 for GRU
// Create memory descriptors
// Input, weights and output use format "any", leaving the layout to oneDNN;
// bias and h0 use fixed formats (ldgo / ldnc).
auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any);
auto weight_x_md = MKLDNNMemDesc(
{L, D, IC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
auto weight_h_md = MKLDNNMemDesc(
{L, D, OC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any);
// An empty descriptor stands for "no initial hidden state".
auto h0_md = dnnl::memory::desc();
if (h0) {
h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
}
// Create GRU oneDNN primitive
const auto direction =
is_reverse ? dnnl::rnn_direction::unidirectional_right2left
: dnnl::rnn_direction::unidirectional_left2right;
// The trailing empty descriptor is passed for the last (unused) argument.
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
}
}
// True when the destination descriptor chosen by the primitive uses the ntc
// format tag.
bool is_NTC() {
  const auto dst_format = platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc());
  return dst_format == dnnl::memory::format_tag::ntc;
}
// Copies sequence data between the packed PaddlePaddle layout
// ([WORDS, C] plus LoD offsets) and a padded oneDNN layout.
//   input_data   - source buffer
//   output_data  - destination buffer
//   lod          - sequence offsets; sequence n spans lod[n]..lod[n + 1]
//   is_reverse   - when true, each sequence is placed at the end of its Ti
//                  time steps instead of the beginning
//   reorder_type - direction and padded layout of the copy
// Padding positions are not written here.
void reorderRNNdata(const T* input_data, T* output_data,
std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) {
switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = input_data;
for (int n = 0; n < N; ++n) {
// A whole sequence is contiguous in NTC, so it is copied in one call.
const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements);
input_data_iter += num_elements;
}
} break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = input_data;
for (int n = 0; n < N; ++n) {
// In TNC the time steps of one sequence are N * IC apart, so each step is
// copied separately.
const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC);
input_data_iter += IC;
}
}
} break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: {
auto* output_data_iter = output_data;
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data + n * Ti * OC + offset,
sizeof(T) * num_elements);
output_data_iter += num_elements;
}
} break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: {
auto* output_data_iter = output_data;
for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter,
input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC);
output_data_iter += OC;
}
}
} break;
}
}
// Returns the oneDNN input memory (cached under key_ + "@input_mem"), filled
// with the sequences of `input` expanded from the packed LoD layout into the
// padded layout of the primitive's source descriptor. The buffer is zeroed
// before the sequences are copied in.
std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
    const LoDTensor* input, const bool is_reverse) {
  const auto blob_name = this->key_ + "@input_mem";
  auto input_mem = std::static_pointer_cast<dnnl::memory>(
      this->dev_ctx_.GetBlob(blob_name));
  if (!input_mem) {
    input_mem = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
                                               this->engine_);
    this->dev_ctx_.SetBlob(blob_name, input_mem);
  }

  const auto& sequence_offsets = input->lod()[0];
  auto* packed_src = input->data<T>();
  auto* padded_dst = reinterpret_cast<T*>(input_mem->get_data_handle());
  memset(padded_dst, 0, sizeof(T) * N * Ti * IC);

  // Pick the copy pattern that matches the layout oneDNN selected.
  const bool src_is_ntc =
      platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
      dnnl::memory::format_tag::ntc;
  const auto reorder_kind = src_is_ntc ? platform::RNNReorderType::PP_NTC
                                       : platform::RNNReorderType::PP_TNC;
  reorderRNNdata(packed_src, padded_dst, sequence_offsets, is_reverse,
                 reorder_kind);
  return input_mem;
}
// Returns the oneDNN output memory cached under key_ + "@output_mem",
// allocating it from the primitive's destination descriptor on first use.
std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
  const auto blob_name = this->key_ + "@output_mem";
  auto output_mem = std::static_pointer_cast<dnnl::memory>(
      this->dev_ctx_.GetBlob(blob_name));
  if (output_mem) {
    return output_mem;
  }
  output_mem = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
                                              this->engine_);
  this->dev_ctx_.SetBlob(blob_name, output_mem);
  return output_mem;
}
// Returns the oneDNN memory for the initial hidden state, cached under
// memory_key_ + "@h0". The memory wraps the data of `h0` directly (no copy);
// on a cache hit only the data handle is re-pointed at the current tensor.
// `h0` must be non-null.
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
  const std::string h0_key = memory_key_ + "@h0";
  auto memory_p =
      std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
  auto* h0_data = to_void_cast(h0->data<T>());
  if (!memory_p) {
    // The initial hidden state is the primitive's src_iter argument, so its
    // memory must use src_iter_desc(); weights_layer_desc() describes the
    // input weights and has a different shape.
    memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
                                              this->engine_, h0_data);
    this->dev_ctx_.SetBlob(h0_key, memory_p);
  } else {
    memory_p->set_data_handle(h0_data);
  }
  return memory_p;
}
// Returns the oneDNN memory holding the input weights, cached under
// memory_key_ + "@weight_x". On first use the float weights are copied into a
// {1, 1, IC, 3, OC} ldigo buffer, optionally sign-flipped (see below), and
// reordered into the layout of the primitive's weights_layer_desc().
// weight_x and origin_mode are only read when the cache entry is missing.
std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x,
const bool origin_mode) {
const std::string wx_key = memory_key_ + "@weight_x";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, IC, 3, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_x_data =
reinterpret_cast<float*>(user_memory.get_data_handle());
memcpy(weight_x_data, weight_x->data<float>(),
sizeof(float) * IC * 3 * OC);
// When origin_mode is false, negate the first OC values of each of the IC
// rows, i.e. the first of the three gate blocks.
// NOTE(review): presumably this adapts the update gate to oneDNN's GRU
// formula -- confirm against the reference kernel.
if (origin_mode == false) {
for (int64_t i = 0; i < IC; ++i) {
for (int64_t j = 0; j < OC; ++j) {
weight_x_data[j] *= -1;
}
weight_x_data += 3 * OC;
}
}
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_layer_desc(), this->engine_);
// Convert from the plain ldigo buffer to the layout oneDNN selected.
dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p);
}
return memory_p;
}
// Returns the oneDNN memory holding the recurrent weights, cached under
// memory_key_ + "@weight_h". On first use the two PaddlePaddle blocks are
// interleaved into a {1, 1, OC, 3, OC} ldigo buffer, optionally sign-flipped
// (see below), and reordered into the layout of weights_iter_desc().
// weight_h and origin_mode are only read when the cache entry is missing.
std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h,
const bool origin_mode) {
const std::string wh_key = memory_key_ + "@weight_h";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, OC, 3, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_);
// Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
// oneDNN format [OC, 3OC]
auto* weight_h_data =
reinterpret_cast<float*>(user_memory.get_data_handle());
auto* user_weight_h_data = weight_h->data<float>();
// src1 walks the [OC, 2OC] block, src2 the [OC, OC] block stored after it.
auto src1_iter = user_weight_h_data;
auto src2_iter = user_weight_h_data + 2 * OC * OC;
for (int64_t c = 0; c < OC; ++c) {
// Each destination row is 2*OC values from src1 followed by OC from src2.
memcpy(weight_h_data, src1_iter, 2 * OC * sizeof(float));
memcpy(weight_h_data + 2 * OC, src2_iter, OC * sizeof(float));
src1_iter += 2 * OC;
src2_iter += OC;
weight_h_data += 3 * OC;
}
// Rewind to the start of the buffer for the optional sign flip.
weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle());
// When origin_mode is false, negate the first OC values of each row, i.e.
// the first of the three gate blocks.
if (origin_mode == false) {
for (int64_t i = 0; i < OC; ++i) {
for (int64_t j = 0; j < OC; ++j) {
weight_h_data[j] *= -1;
}
weight_h_data += 3 * OC;
}
}
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_iter_desc(), this->engine_);
// Convert from the plain ldigo buffer to the layout oneDNN selected.
dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p);
}
return memory_p;
}
// Returns the oneDNN bias memory cached under memory_key_ + "@bias".
// On first use the memory is allocated from the primitive's bias descriptor
// and filled with the 3 * OC float values of `bias`, or with zeros when no
// bias tensor is given. When origin_mode is false and a bias is given, the
// first OC values are negated.
std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias,
                                                const bool origin_mode) {
  const std::string bias_key = memory_key_ + "@bias";
  auto cached = std::static_pointer_cast<dnnl::memory>(
      this->dev_ctx_.GetBlob(bias_key));
  if (cached) {
    return cached;
  }

  cached = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
                                          this->engine_);
  auto* dst = reinterpret_cast<float*>(cached->get_data_handle());
  if (bias) {
    // Bias in oneDNN is always float
    const float* src = bias->data<float>();
    memcpy(dst, src, sizeof(float) * 3 * OC);
    if (origin_mode == false) {
      for (int64_t k = 0; k < OC; ++k) {
        dst[k] *= -1;
      }
    }
  } else {
    // oneDNN always needs a bias memory; without a PaddlePaddle bias the
    // freshly allocated buffer is cleared to zero.
    memset(dst, 0, sizeof(float) * 3 * OC);
  }
  this->dev_ctx_.SetBlob(bias_key, cached);
  return cached;
}
private:
// RNN dimensions
// N - Batch Size
// Ti - Max sentence length
// IC - Input Channels
// OC - Output Channels
const int64_t N, Ti, IC, OC;
// Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them
std::string memory_key_;
};
// oneDNN (MKLDNN) kernel for the fusion_gru operator: gathers the operator's
// tensors and attributes, lets GRUMKLDNNHandler prepare the oneDNN memories,
// runs the GRU forward primitive and copies the result back into "Hidden".
template <typename T>
class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    // Operator inputs and output
    const auto* input = ctx.Input<LoDTensor>("X");
    const auto* h0 = ctx.Input<Tensor>("H0");
    const auto* weight_x = ctx.Input<Tensor>("WeightX");
    const auto* weight_h = ctx.Input<Tensor>("WeightH");
    const auto* bias = ctx.Input<Tensor>("Bias");
    auto* hidden = ctx.Output<LoDTensor>("Hidden");

    // Operator attributes
    const bool is_reverse = ctx.Attr<bool>("is_reverse");
    const bool origin_mode = ctx.Attr<bool>("origin_mode");

    // Shapes and the level-0 LoD (sentence offsets)
    const auto x_dims = framework::vectorize(input->dims());
    const auto weight_h_dims = framework::vectorize(weight_h->dims());
    const auto& input_lod = input->lod()[0];

    // RNN dimensions
    // N  - number of sentences (batch size)
    const int64_t N = input_lod.size() - 1;
    // Ti - length of the longest sentence in the batch
    size_t longest_sentence = 0;
    for (size_t s = 0; s < (input_lod.size() - 1); ++s) {
      longest_sentence =
          std::max(longest_sentence, input_lod[s + 1] - input_lod[s]);
    }
    const int64_t Ti = longest_sentence;
    // IC - input channels, OC - output channels
    const int64_t IC = x_dims[1];
    const int64_t OC = weight_h_dims[0];

    GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(),
                                input, weight_h, h0, is_reverse, N, Ti, IC, OC,
                                ctx.InputName("X") + ctx.InputName("WeightH"));

    // oneDNN memories for every primitive argument
    auto src_layer_mem =
        handler.AcquireInputMemoryWithReorder(input, is_reverse);
    auto weights_layer_mem =
        handler.AcquireWeightXMemory(weight_x, origin_mode);
    auto weights_iter_mem =
        handler.AcquireWeightHMemory(weight_h, origin_mode);
    auto bias_mem = handler.AcquireBiasMemory(bias, origin_mode);
    auto dst_layer_mem = handler.AcquireOutputMemory();

    std::unordered_map<int, dnnl::memory> gru_args = {
        {DNNL_ARG_SRC_LAYER, *src_layer_mem},
        {DNNL_ARG_WEIGHTS_LAYER, *weights_layer_mem},
        {DNNL_ARG_WEIGHTS_ITER, *weights_iter_mem},
        {DNNL_ARG_BIAS, *bias_mem},
        {DNNL_ARG_DST_LAYER, *dst_layer_mem}};

    // The initial hidden state is optional.
    if (h0) {
      auto src_iter_mem = handler.AcquireH0Memory(h0);
      gru_args.insert({DNNL_ARG_SRC_ITER, *src_iter_mem});
    }

    auto gru_forward_p = handler.AcquireForwardPrimitive();

    dnnl::stream astream(mkldnn_engine);
    gru_forward_p->execute(astream, gru_args);
    astream.wait();

    // Copy the oneDNN result back into the Paddle output tensor.
    auto* onednn_out =
        reinterpret_cast<T*>(dst_layer_mem->get_data_handle());
    auto* paddle_out = hidden->mutable_data<T>(ctx.GetPlace());
    const platform::RNNReorderType reorder_kind =
        handler.is_NTC() ? platform::RNNReorderType::NTC_PP
                         : platform::RNNReorderType::TNC_PP;
    handler.reorderRNNdata(onednn_out, paddle_out, input_lod, is_reverse,
                           reorder_kind);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register FusionGRUMKLDNNKernel as the MKLDNN kernel of the fusion_gru
// operator on CPUPlace. Only the float instantiation is registered here.
REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
                   ops::FusionGRUMKLDNNKernel<float>);
......@@ -181,6 +181,8 @@ inline mkldnn::memory::format_tag GetMKLDNNFormat(
if (inner_nblks == 0) {
if (strides[0] >= strides[1] && strides[1] >= strides[2]) {
return mkldnn::memory::format_tag::ncw;
} else if (strides[1] >= strides[0] && strides[0] >= strides[2]) {
return mkldnn::memory::format_tag::ntc;
} else {
return mkldnn::memory::format_tag::nwc;
}
......@@ -420,5 +422,7 @@ inline std::vector<std::vector<int64_t>> ToMkldnnPadding(
}
}
// Selects which layout conversion an RNN data reorder performs.
// NOTE(review): the names appear to read <from>_<to>, with "PP" standing for
// Paddle's own sequence layout and NTC / TNC for the oneDNN-side layouts
// (the fusion_gru kernel uses NTC_PP / TNC_PP when copying the oneDNN result
// into its output tensor) - confirm against the reorder helper.
enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP };
} // namespace platform
} // namespace paddle
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
gloo_wrapper infer_io_utils)
gloo_wrapper infer_io_utils heter_wrapper)
if (WITH_NCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
......@@ -31,6 +31,7 @@ set(PYBIND_SRCS
global_value_getter_setter.cc
reader_py.cc
fleet_wrapper_py.cc
heter_wrapper_py.cc
gloo_wrapper_py.cc
box_helper_py.cc
data_set_py.cc
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#include <string>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
#ifdef PADDLE_WITH_PSLIB
// Exposes framework::HeterWrapper to Python as the class "Heter" on module m.
void BindHeterWrapper(py::module* m) {
  using framework::HeterWrapper;

  py::class_<HeterWrapper, std::shared_ptr<HeterWrapper>> heter_class(*m,
                                                                      "Heter");

  // Constructing Heter() from Python returns HeterWrapper::GetInstance().
  heter_class.def(py::init([]() { return HeterWrapper::GetInstance(); }));

  heter_class.def("create_client2xpu_connection",
                  &HeterWrapper::CreateClient2XpuConnection);
  heter_class.def("set_xpu_list", &HeterWrapper::SetXpuList);
  heter_class.def("start_xpu_service", &HeterWrapper::StartXpuService);
  heter_class.def("end_pass", &HeterWrapper::EndPass);
  heter_class.def("stop_xpu_service", &HeterWrapper::StopXpuService);
}  // end HeterWrapper
#endif
} // end namespace pybind
} // end namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
#ifdef PADDLE_WITH_PSLIB
// Registers the Python binding of the heter wrapper on module `m`.
// Declared only when PADDLE_WITH_PSLIB is defined.
void BindHeterWrapper(py::module* m);
#endif
} // namespace pybind
} // namespace paddle
......@@ -66,6 +66,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
......@@ -2479,6 +2480,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("device_count", &ParallelExecutor::DeviceCount);
BindFleetWrapper(&m);
#ifdef PADDLE_WITH_PSLIB
BindHeterWrapper(&m);
#endif
BindGlooWrapper(&m);
BindBoxHelper(&m);
#ifdef PADDLE_WITH_BOX_PS
......
......@@ -16,10 +16,13 @@
from .base.distributed_strategy import DistributedStrategy
from .base.fleet_base import Fleet
from .base.util_factory import UtilBase
from .dataset import *
#from .base.role_maker import PaddleCloudRoleMaker
__all__ = ["DistributedStrategy", "UtilBase"]
__all__ = [
"DistributedStrategy", "UtilBase", "DatasetFactory", "DatasetBase",
"InMemoryDataset", "QueueDataset"
]
fleet = Fleet()
init = fleet.init
......
......@@ -10,3 +10,5 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from .dataset import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This is definition of dataset class, which is high performance IO."""
import paddle
import paddle.fluid as fluid
from paddle.fluid.proto import data_feed_pb2
from google.protobuf import text_format
import paddle.fluid.core as core
class DatasetFactory(object):
    """
    DatasetFactory is a factory which creates a dataset by its class name.
    You can create "QueueDataset", "InMemoryDataset" or "FileInstantDataset";
    the default is "QueueDataset".

    Example:
        .. code-block:: python

          import paddle.fluid as fluid
          dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")

    """

    def __init__(self):
        """ Init. """
        pass

    def create_dataset(self, datafeed_class="QueueDataset"):
        """
        Create "QueueDataset", "InMemoryDataset" or "FileInstantDataset";
        the default is "QueueDataset".

        The class is looked up by name among this module's globals and
        instantiated without arguments.

        Args:
            datafeed_class(str): dataset class name, e.g. QueueDataset or
                                 InMemoryDataset. Default is QueueDataset.

        Returns:
            A new instance of the requested dataset class.

        Raises:
            ValueError: if the name cannot be resolved or the class fails
                        to instantiate.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()

        """
        try:
            dataset = globals()[datafeed_class]()
            return dataset
        except Exception:
            # Deliberately not a bare ``except:`` so that KeyboardInterrupt
            # and SystemExit are not rewritten into a ValueError.
            raise ValueError("datafeed class %s does not exist" %
                             datafeed_class)
class DatasetBase(object):
    """ Base dataset class. """

    def __init__(self):
        """ Init. """
        # define class name here
        # to decide whether we need create in memory instance
        self.proto_desc = data_feed_pb2.DataFeedDesc()
        self.proto_desc.pipe_command = "cat"
        self.dataset = core.Dataset("MultiSlotDataset")
        self.thread_num = 1
        self.filelist = []
        # slots_shuffle() reads this flag; give it a default so that calling
        # slots_shuffle() before set_fea_eval() is a no-op instead of an
        # AttributeError.
        self.fea_eval = False

    def set_pipe_command(self, pipe_command):
        """
        Set pipe command of current dataset.
        A pipe command is a UNIX pipeline command string; it is stored in the
        data feed description (the default set in __init__ is "cat").

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_pipe_command("python my_script.py")

        Args:
            pipe_command(str): pipe command

        """
        self.proto_desc.pipe_command = pipe_command

    def set_rank_offset(self, rank_offset):
        """
        Set rank_offset for merge_pv. It sets the message of Pv.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_rank_offset("rank_offset")

        Args:
            rank_offset(str): rank_offset's name

        """
        self.proto_desc.rank_offset = rank_offset

    def set_fea_eval(self, record_candidate_size, fea_eval=True):
        """
        Set fea eval mode for slots shuffle to debug the importance level of
        slots(features). fea_eval needs to be True for slots shuffle.

        Args:
            record_candidate_size(int): size of instances candidate to shuffle
                                        one slot
            fea_eval(bool): whether enable fea eval mode to enable slots shuffle.
                            default is True.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_fea_eval(1000000, True)

        """
        # The core dataset is only told about fea eval when it is enabled;
        # the Python-side flag is recorded either way.
        if fea_eval:
            self.dataset.set_fea_eval(fea_eval, record_candidate_size)
        self.fea_eval = fea_eval

    def slots_shuffle(self, slots):
        """
        Slots Shuffle
        Slots Shuffle is a shuffle method in slots level, which is usually used
        in sparse feature with large scale of instances. To compare the metric, i.e.
        auc while doing slots shuffle on one or several slots with baseline to
        evaluate the importance level of slots(features).

        Does nothing unless fea eval mode has been enabled with
        set_fea_eval().

        Args:
            slots(list[string]): the set of slots(string) to do slots shuffle.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_merge_by_lineid()
              #suppose there is a slot 0
              dataset.slots_shuffle(['0'])

        """
        if self.fea_eval:
            slots_set = set(slots)
            self.dataset.slots_shuffle(slots_set)

    def set_batch_size(self, batch_size):
        """
        Set batch size. Will be effective during training

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_batch_size(128)

        Args:
            batch_size(int): batch size

        """
        self.proto_desc.batch_size = batch_size

    def set_pv_batch_size(self, pv_batch_size):
        """
        Set pv batch size. It will be effective during enable_pv_merge

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_pv_batch_size(128)

        Args:
            pv_batch_size(int): pv batch size

        """
        self.proto_desc.pv_batch_size = pv_batch_size

    def set_thread(self, thread_num):
        """
        Set thread num, it is the num of readers.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_thread(12)

        Args:
            thread_num(int): thread num

        """
        self.dataset.set_thread_num(thread_num)
        self.thread_num = thread_num

    def set_filelist(self, filelist):
        """
        Set file list in current worker.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_filelist(['a.txt', 'b.txt'])

        Args:
            filelist(list): file list

        """
        self.dataset.set_filelist(filelist)
        self.filelist = filelist

    def set_input_type(self, input_type):
        """
        Store input_type in the data feed description.

        Args:
            input_type: value assigned to proto_desc.input_type

        """
        self.proto_desc.input_type = input_type

    def set_use_var(self, var_list):
        """
        Set Variables which you will use.

        One slot is appended to the data feed description per variable.
        Variables with lod_level 0 are marked dense and carry their shape.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_use_var([data, label])

        Args:
            var_list(list): variable list

        Raises:
            ValueError: if a variable's dtype is neither float32 nor int64.

        """
        multi_slot = self.proto_desc.multi_slot_desc
        for var in var_list:
            slot_var = multi_slot.slots.add()
            slot_var.is_used = True
            slot_var.name = var.name
            if var.lod_level == 0:
                slot_var.is_dense = True
                slot_var.shape.extend(var.shape)
            if var.dtype == core.VarDesc.VarType.FP32:
                slot_var.type = "float"
            elif var.dtype == core.VarDesc.VarType.INT64:
                # int64 variables are declared as "uint64" slots.
                slot_var.type = "uint64"
            else:
                raise ValueError(
                    "Currently, fluid.dataset only supports dtype=float32 and dtype=int64"
                )

    def set_hdfs_config(self, fs_name, fs_ugi):
        """
        Set hdfs config: fs name and ugi

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")

        Args:
            fs_name(str): fs name
            fs_ugi(str): fs ugi

        """
        self.dataset.set_hdfs_config(fs_name, fs_ugi)

    def set_download_cmd(self, download_cmd):
        """
        Set customized download cmd: download_cmd

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_download_cmd("./read_from_afs")

        Args:
            download_cmd(str): customized download command

        """
        self.dataset.set_download_cmd(download_cmd)

    def _prepare_to_run(self):
        """
        Set data_feed_desc before load or shuffle,
        user no need to call this function.
        """
        # Never use more reader threads than there are files.
        if self.thread_num > len(self.filelist):
            self.thread_num = len(self.filelist)
        self.dataset.set_thread_num(self.thread_num)
        self.dataset.set_data_feed_desc(self.desc())
        self.dataset.create_readers()

    def _finish_to_run(self):
        """ Destroy the readers created by _prepare_to_run. """
        self.dataset.destroy_readers()

    def desc(self):
        """
        Returns the text-format string of this dataset's DataFeedDesc.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              print(dataset.desc())

        Returns:
            A string message

        """
        return text_format.MessageToString(self.proto_desc)

    def _dynamic_adjust_before_train(self, thread_num):
        """ Hook called before training; a no-op in the base class. """
        pass

    def _dynamic_adjust_after_train(self):
        """ Hook called after training; a no-op in the base class. """
        pass
class InMemoryDataset(DatasetBase):
    """
    InMemoryDataset, it will load data into memory
    and shuffle data before training.
    This class should be created by DatasetFactory

    Example:
        dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset")
    """

    def __init__(self):
        """ Init. """
        super(InMemoryDataset, self).__init__()
        self.proto_desc.name = "MultiSlotInMemoryDataFeed"
        # None means "not set by the user"; global_shuffle() falls back to
        # 1024 (batch size) and 0 (sleep seconds).
        self.fleet_send_batch_size = None
        self.is_user_set_queue_num = False
        # None means "use thread_num"; resolved in _prepare_to_run().
        self.queue_num = None
        self.parse_ins_id = False
        self.parse_content = False
        self.parse_logkey = False
        self.merge_by_sid = True
        self.enable_pv_merge = False
        self.merge_by_lineid = False
        self.fleet_send_sleep_seconds = None

    def set_feed_type(self, data_feed_type):
        """
        Set the data feed type: stored as the name field of the data feed
        description.

        Args:
            data_feed_type(str): data feed name
        """
        self.proto_desc.name = data_feed_type

    def _prepare_to_run(self):
        """
        Set data_feed_desc before load or shuffle,
        user no need to call this function.
        """
        if self.thread_num <= 0:
            self.thread_num = 1
        self.dataset.set_thread_num(self.thread_num)
        # The queue number defaults to the thread number.
        if self.queue_num is None:
            self.queue_num = self.thread_num
        self.dataset.set_queue_num(self.queue_num)
        self.dataset.set_parse_ins_id(self.parse_ins_id)
        self.dataset.set_parse_content(self.parse_content)
        self.dataset.set_parse_logkey(self.parse_logkey)
        self.dataset.set_merge_by_sid(self.merge_by_sid)
        self.dataset.set_enable_pv_merge(self.enable_pv_merge)
        self.dataset.set_data_feed_desc(self.desc())
        self.dataset.create_channel()
        self.dataset.create_readers()

    def _dynamic_adjust_before_train(self, thread_num):
        """
        Adjust the reader number to thread_num before training; the channel
        number is adjusted as well unless the user set the queue number.
        """
        if not self.is_user_set_queue_num:
            self.dataset.dynamic_adjust_channel_num(thread_num, False)
        self.dataset.dynamic_adjust_readers_num(thread_num)

    def _dynamic_adjust_after_train(self):
        """
        Restore the reader number (and, unless the user set the queue number,
        the channel number) to self.thread_num after training.
        """
        if not self.is_user_set_queue_num:
            self.dataset.dynamic_adjust_channel_num(self.thread_num, False)
        self.dataset.dynamic_adjust_readers_num(self.thread_num)

    def set_queue_num(self, queue_num):
        """
        Set Dataset output queue num, training threads get data from queues

        Args:
            queue_num(int): dataset output queue num

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_queue_num(12)

        """
        self.is_user_set_queue_num = True
        self.queue_num = queue_num

    def set_parse_ins_id(self, parse_ins_id):
        """
        Set whether the Dataset needs to parse ins_id

        Args:
            parse_ins_id(bool): if parse ins_id or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_parse_ins_id(True)

        """
        self.parse_ins_id = parse_ins_id

    def set_parse_content(self, parse_content):
        """
        Set whether the Dataset needs to parse content

        Args:
            parse_content(bool): if parse content or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_parse_content(True)

        """
        self.parse_content = parse_content

    def set_parse_logkey(self, parse_logkey):
        """
        Set whether the Dataset needs to parse logkey

        Args:
            parse_logkey(bool): if parse logkey or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_parse_logkey(True)

        """
        self.parse_logkey = parse_logkey

    def set_merge_by_sid(self, merge_by_sid):
        """
        Set whether the Dataset needs to merge by sid. If not, one ins means one Pv.

        Args:
            merge_by_sid(bool): if merge sid or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_merge_by_sid(True)

        """
        self.merge_by_sid = merge_by_sid

    def set_enable_pv_merge(self, enable_pv_merge):
        """
        Set whether the Dataset needs to merge pv.

        Args:
            enable_pv_merge(bool): if enable_pv_merge or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_enable_pv_merge(True)

        """
        self.enable_pv_merge = enable_pv_merge

    def preprocess_instance(self):
        """
        Merge pv instance and convey it from input_channel to input_pv_channel.
        It will be effective when enable_pv_merge_ is True.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.preprocess_instance()

        """
        self.dataset.preprocess_instance()

    def set_current_phase(self, current_phase):
        """
        Set current phase in train. The original documentation describes it
        as useful for unit tests.
        current_phase : 1 for join, 0 for update.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.set_current_phase(1)

        """
        self.dataset.set_current_phase(current_phase)

    def postprocess_instance(self):
        """
        Divide pv instance and convey it to input_channel.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.preprocess_instance()
              exe.train_from_dataset(dataset)
              dataset.postprocess_instance()

        """
        self.dataset.postprocess_instance()

    def set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
        """
        Set fleet send batch size, default is 1024

        Args:
            fleet_send_batch_size(int): fleet send batch size

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_fleet_send_batch_size(800)

        """
        self.fleet_send_batch_size = fleet_send_batch_size

    def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
        """
        Set fleet send sleep time, default is 0

        Args:
            fleet_send_sleep_seconds(int): fleet send sleep time

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_fleet_send_sleep_seconds(2)

        """
        self.fleet_send_sleep_seconds = fleet_send_sleep_seconds

    def set_merge_by_lineid(self, merge_size=2):
        """
        Set merge by line id, instances of same line id will be merged after
        shuffle, you should parse line id in data generator.
        Calling this also turns on parse_ins_id.

        Args:
            merge_size(int): ins size to merge. default is 2.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_merge_by_lineid()

        """
        self.dataset.set_merge_by_lineid(merge_size)
        self.merge_by_lineid = True
        self.parse_ins_id = True

    def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
        """
        Forward generate_uni_feasigns to the core dataset and remember both
        arguments on this object (gen_uni_feasigns, local_shard_num).

        Args:
            generate_uni_feasigns: flag passed to the core dataset
            shard_num: stored as local_shard_num; not passed to the core
                       dataset here
        """
        self.dataset.set_generate_unique_feasigns(generate_uni_feasigns)
        self.gen_uni_feasigns = generate_uni_feasigns
        self.local_shard_num = shard_num

    def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num,
                                     consume_thread_num, shard_num):
        """
        Forward all arguments unchanged to the core dataset's
        generate_local_tables_unlock.
        """
        self.dataset.generate_local_tables_unlock(
            table_id, fea_dim, read_thread_num, consume_thread_num, shard_num)

    def load_into_memory(self):
        """
        Load data into memory

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()

        """
        self._prepare_to_run()
        self.dataset.load_into_memory()

    def preload_into_memory(self, thread_num=None):
        """
        Load data into memory in async mode

        Args:
            thread_num(int): preload thread num. Defaults to this dataset's
                             thread_num when None.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.preload_into_memory()
              dataset.wait_preload_done()

        """
        self._prepare_to_run()
        if thread_num is None:
            thread_num = self.thread_num
        self.dataset.set_preload_thread_num(thread_num)
        self.dataset.create_preload_readers()
        self.dataset.preload_into_memory()

    def wait_preload_done(self):
        """
        Wait preload_into_memory done, then destroy the preload readers.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.preload_into_memory()
              dataset.wait_preload_done()

        """
        self.dataset.wait_preload_done()
        self.dataset.destroy_preload_readers()

    def local_shuffle(self):
        """
        Local shuffle

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.local_shuffle()

        """
        self.dataset.local_shuffle()

    def global_shuffle(self, fleet=None, thread_num=12):
        """
        Global shuffle.
        Global shuffle can be used only in distributed mode. i.e. multiple
        processes on single machine or multiple machines training together.
        If you run in distributed mode, you should pass fleet instead of None.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.global_shuffle(fleet)

        Args:
            fleet(Fleet): fleet singleton. Default None.
            thread_num(int): shuffle thread num. Default is 12.

        """
        trainer_num = 1
        if fleet is not None:
            fleet._role_maker.barrier_worker()
            trainer_num = fleet.worker_num()
        # Fall back to the defaults when the user never set these.
        if self.fleet_send_batch_size is None:
            self.fleet_send_batch_size = 1024
        if self.fleet_send_sleep_seconds is None:
            self.fleet_send_sleep_seconds = 0
        self.dataset.register_client2client_msg_handler()
        self.dataset.set_trainer_num(trainer_num)
        self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
        self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds)
        # When a fleet is given, workers are synchronised with a barrier
        # before the shuffle, after it, and after the optional merge.
        if fleet is not None:
            fleet._role_maker.barrier_worker()
        self.dataset.global_shuffle(thread_num)
        if fleet is not None:
            fleet._role_maker.barrier_worker()
        if self.merge_by_lineid:
            self.dataset.merge_by_lineid()
        if fleet is not None:
            fleet._role_maker.barrier_worker()

    def release_memory(self):
        """
        :api_attr: Static Graph

        Release InMemoryDataset memory data, when data will not be used again.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.global_shuffle(fleet)
              exe = fluid.Executor(fluid.CPUPlace())
              exe.run(fluid.default_startup_program())
              exe.train_from_dataset(fluid.default_main_program(), dataset)
              dataset.release_memory()

        """
        self.dataset.release_memory()

    def get_pv_data_size(self):
        """
        Get memory data size of Pv, user can call this function to know the pv num
        of ins in all workers after load into memory.

        Note:
            The original documentation warns that this function may cause bad
            performance because it has a barrier. NOTE(review): no barrier is
            visible in this wrapper; confirm against the core implementation.

        Returns:
            The size of memory pv data.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              print(dataset.get_pv_data_size())

        """
        return self.dataset.get_pv_data_size()

    def get_memory_data_size(self, fleet=None):
        """
        Get memory data size, user can call this function to know the num
        of ins in all workers after load into memory.

        Note:
            This function may cause bad performance, because it has barrier

        Args:
            fleet(Fleet): Fleet Object. When given, the local size is
                          all-reduced across workers; otherwise only the
                          local size is returned.

        Returns:
            The size of memory data.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              print(dataset.get_memory_data_size(fleet))

        """
        import numpy as np
        local_data_size = self.dataset.get_memory_data_size()
        local_data_size = np.array([local_data_size])
        if fleet is not None:
            # Zero-filled buffer of the same shape, passed as the second
            # argument of the all-reduce.
            global_data_size = local_data_size * 0
            fleet._role_maker.all_reduce_worker(local_data_size,
                                                global_data_size)
            return global_data_size[0]
        return local_data_size[0]

    def get_shuffle_data_size(self, fleet=None):
        """
        Get shuffle data size, user can call this function to know the num
        of ins in all workers after local/global shuffle.

        Note:
            This function may cause bad performance to local shuffle,
            because it has barrier. It does not affect global shuffle.

        Args:
            fleet(Fleet): Fleet Object. When given, the local size is
                          all-reduced across workers; otherwise only the
                          local size is returned.

        Returns:
            The size of shuffle data.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.global_shuffle(fleet)
              print(dataset.get_shuffle_data_size(fleet))

        """
        import numpy as np
        local_data_size = self.dataset.get_shuffle_data_size()
        local_data_size = np.array([local_data_size])
        if fleet is not None:
            # Zero-filled buffer of the same shape, passed as the second
            # argument of the all-reduce.
            global_data_size = local_data_size * 0
            fleet._role_maker.all_reduce_worker(local_data_size,
                                                global_data_size)
            return global_data_size[0]
        return local_data_size[0]
class QueueDataset(DatasetBase):
    """
    QueueDataset processes data in a streaming fashion.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          dataset = fluid.DatasetFactory().create_dataset("QueueDataset")

    """

    def __init__(self):
        """
        Initialize QueueDataset.
        Instances are meant to be created through DatasetFactory.
        """
        super(QueueDataset, self).__init__()
        self.proto_desc.name = "MultiSlotDataFeed"

    def _prepare_to_run(self):
        """
        Push thread num, file list and data_feed_desc to the core dataset and
        create its readers. Users do not need to call this function.
        """
        # Cap the reader threads at the number of files, but keep at least one.
        file_count = len(self.filelist)
        if self.thread_num > file_count:
            self.thread_num = file_count
        if self.thread_num == 0:
            self.thread_num = 1

        core_dataset = self.dataset
        core_dataset.set_thread_num(self.thread_num)
        core_dataset.set_filelist(self.filelist)
        core_dataset.set_data_feed_desc(self.desc())
        core_dataset.create_readers()

    def local_shuffle(self):
        """
        Local shuffle data.

        QueueDataset does not support local shuffle, so this always raises.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
              dataset.local_shuffle()

        Raises:
            NotImplementedError: QueueDataset does not support local shuffle

        """
        message = ("QueueDataset does not support local shuffle, "
                   "please use InMemoryDataset for local_shuffle")
        raise NotImplementedError(message)

    def global_shuffle(self, fleet=None):
        """
        Global shuffle data.

        QueueDataset does not support global shuffle, so this always raises.

        Args:
            fleet(Fleet): fleet singleton. Default None.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
              dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
              dataset.global_shuffle(fleet)

        Raises:
            NotImplementedError: QueueDataset does not support global shuffle

        """
        message = ("QueueDataset does not support global shuffle, "
                   "please use InMemoryDataset for global_shuffle")
        raise NotImplementedError(message)
class FileInstantDataset(DatasetBase):
    """
    FileInstantDataset, it will process data in a streaming fashion.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")

    """

    def __init__(self):
        """
        Initialize FileInstantDataset
        This class should be created by DatasetFactory
        """
        super(FileInstantDataset, self).__init__()
        self.proto_desc.name = "MultiSlotFileInstantDataFeed"

    def local_shuffle(self):
        """
        Local shuffle
        FileInstantDataset does not support local shuffle

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "FileInstantDataset does not support local shuffle, "
            "please use InMemoryDataset for local_shuffle")

    def global_shuffle(self, fleet=None):
        """
        Global shuffle
        FileInstantDataset does not support global shuffle

        Args:
            fleet(Fleet): unused. Default None.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "FileInstantDataset does not support global shuffle, "
            "please use InMemoryDataset for global_shuffle")
class BoxPSDataset(InMemoryDataset):
    """
    BoxPSDataset: an InMemoryDataset that forwards pass management and
    loading to a BoxPS handle.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
    """

    def __init__(self):
        """
        Initialize BoxPSDataset.

        This class should be created by DatasetFactory.
        """
        super(BoxPSDataset, self).__init__()
        # BoxPS handle built around the dataset created by the base class.
        self.boxps = core.BoxPS(self.dataset)
        self.proto_desc.name = "PaddleBoxDataFeed"

    def set_date(self, date):
        """
        Workaround for date: split ``date`` into year, month and day and
        hand them to BoxPS.

        Args:
            date(str): the first four characters are parsed as the year,
                the next two as the month and the remainder as the day.
        """
        year, month, day = int(date[:4]), int(date[4:6]), int(date[6:])
        self.boxps.set_date(year, month, day)

    def begin_pass(self):
        """
        Begin pass.

        Notify BoxPS to load sparse parameters of next pass to GPU memory.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
                dataset.begin_pass()
        """
        self.boxps.begin_pass()

    def end_pass(self, need_save_delta):
        """
        End pass.

        Notify BoxPS that the current pass ended.

        Args:
            need_save_delta: forwarded unchanged to BoxPS.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
                dataset.end_pass(True)
        """
        self.boxps.end_pass(need_save_delta)

    def wait_preload_done(self):
        """
        Wait until the asynchronous preload (feed pass) is done.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
                filelist = ["a.txt", "b.txt"]
                dataset.set_filelist(filelist)
                dataset.preload_into_memory()
                dataset.wait_preload_done()
        """
        self.boxps.wait_feed_pass_done()

    def load_into_memory(self):
        """
        Load next pass into memory and notify BoxPS to fetch its emb from SSD.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
                filelist = ["a.txt", "b.txt"]
                dataset.set_filelist(filelist)
                dataset.load_into_memory()
        """
        self._prepare_to_run()
        self.boxps.load_into_memory()

    def preload_into_memory(self):
        """
        Begin asynchronous preload of the next pass while the current pass
        may still be training.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
                filelist = ["a.txt", "b.txt"]
                dataset.set_filelist(filelist)
                dataset.preload_into_memory()
        """
        self._prepare_to_run()
        self.boxps.preload_into_memory()

    def _dynamic_adjust_before_train(self, thread_num):
        """
        Adjust the readers (and, unless the user set the queue number, the
        channels) of the underlying dataset to ``thread_num``.
        """
        backend = self.dataset
        if not self.is_user_set_queue_num:
            backend.dynamic_adjust_channel_num(thread_num, True)
        backend.dynamic_adjust_readers_num(thread_num)

    def _dynamic_adjust_after_train(self):
        """Nothing to adjust after training for BoxPSDataset."""
        pass

    def slots_shuffle(self, slots):
        """
        Slots shuffle.

        Slots shuffle is a shuffle method at slot level, usually used for
        sparse features with a large number of instances. Comparing a
        metric such as auc between a run with one or several slots
        shuffled and the baseline evaluates the importance of those
        slots (features).

        Args:
            slots(list[string]): the set of slots(string) to do slots shuffle.

        Examples:
            .. code-block:: python

                import paddle.fluid as fluid
                dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
                dataset.set_merge_by_lineid()
                #suppose there is a slot 0
                dataset.slots_shuffle(['0'])
        """
        # Duplicates in ``slots`` are dropped before handing over to BoxPS.
        self.boxps.slots_shuffle(set(slots))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import functools
import sys
__all__ = ['deprecated']
def deprecated(since, instead, extra_message=""):
    """
    Build a decorator that marks an API as deprecated.

    The decorated function prints a deprecation message to stderr on every
    call and then runs normally. The same message is appended to the
    function's docstring.

    Args:
        since(str): version since which the API is deprecated.
        instead(str): name of the API that should be used instead.
        extra_message(str): optional text appended to the message on a new
            line. Default "".

    Returns:
        callable: a decorator to apply to the deprecated function.
    """

    def decorator(func):
        err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
            func.__name__, since, instead)
        if len(extra_message) != 0:
            err_msg += "\n"
            err_msg += extra_message

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            print(err_msg, file=sys.stderr)
            return func(*args, **kwargs)

        # functools.wraps copies func.__doc__, which is None for functions
        # without a docstring; fall back to "" so the append cannot fail.
        wrapper.__doc__ = (wrapper.__doc__ or "") + "\n " + err_msg
        return wrapper

    return decorator
......@@ -223,7 +223,8 @@ class DownpourSGD(DeviceWorker):
dense_table_set.add(i)
break
trainer_desc.device_worker_name = "DownpourWorker"
trainer_desc.device_worker_name = opt_info.get("worker_class",
"DownpourWorker")
pull_thread = trainer_desc.pull_dense_param
pull_thread.device_num = trainer_desc.thread_num
if opt_info.get("program_id_to_worker") is None:
......
......@@ -36,7 +36,6 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle.fluid.dygraph.base import param_guard
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
from paddle.fluid.annotations import deprecated
__all__ = ['ProgramTranslator', 'convert_to_static']
......
......@@ -35,7 +35,7 @@ __all__ = [
'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu',
'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm',
'SpectralNorm', 'TreeConv'
'SpectralNorm', 'TreeConv', 'Flatten'
]
......@@ -3182,3 +3182,62 @@ class TreeConv(layers.Layer):
else:
pre_activation = out
return self._helper.append_activation(pre_activation, act=self._act)
class Flatten(layers.Layer):
    """
    :alias_main: paddle.nn.Flatten
    :alias: paddle.nn.Flatten,paddle.nn.layer.Flatten,paddle.nn.layer.common.Flatten

    This interface is used to construct a callable object of the ``Flatten`` class.
    For more details, refer to code examples.
    It flattens a contiguous range of dims of the input into a tensor.

    Parameters:
        start_axis(int): first dim to flatten (default = 1)
        stop_axis(int): last dim to flatten (default = -1).

    Returns:
        A callable layer object; calling it on an input returns the
        flattened output.

    Examples:

        .. code-block:: python

          import paddle
          from paddle.imperative import to_variable
          import numpy as np

          inp_np = np.ones([5, 2, 3, 4]).astype('float32')

          paddle.enable_imperative()

          inp_np = to_variable(inp_np)
          flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
          flatten_res = flatten(inp_np)
    """

    def __init__(self, start_axis=1, stop_axis=-1):
        super(Flatten, self).__init__()
        self.start_axis = start_axis
        self.stop_axis = stop_axis

    def forward(self, input):
        """
        Apply ``flatten_contiguous_range`` to ``input`` with the configured
        ``start_axis`` and ``stop_axis`` and return the ``Out`` result.
        """
        # Dygraph fast path: call the op directly. It returns two values and
        # only the first one is returned to the caller. The static-graph
        # output variables below are not needed on this path.
        if in_dygraph_mode():
            dy_out, _ = core.ops.flatten_contiguous_range(
                input, 'start_axis', self.start_axis, 'stop_axis',
                self.stop_axis)
            return dy_out

        # Static graph: create the two output variables and append the op.
        out = self._helper.create_variable_for_type_inference(input.dtype)
        x_shape = self._helper.create_variable_for_type_inference(input.dtype)
        self._helper.append_op(
            type="flatten_contiguous_range",
            inputs={"X": input},
            outputs={"Out": out,
                     "XShape": x_shape},
            attrs={"start_axis": self.start_axis,
                   "stop_axis": self.stop_axis})
        return out
......@@ -1300,6 +1300,12 @@ class Executor(object):
fetch_list=None,
fetch_info=None,
print_period=100):
# Heterogeneous training is requested through the fleet options, either by
# the device worker class or by the trainer class.
is_heter = 0
if program._fleet_opt is not None:
    if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker":
        is_heter = 1
    # _fleet_opt is queried with .get() like the line above; calling it as a
    # function raises TypeError for a dict.
    if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer":
        is_heter = 1
if scope is None:
scope = global_scope()
if fetch_list is None:
......@@ -1308,6 +1314,11 @@ class Executor(object):
fetch_info = []
assert len(fetch_list) == len(fetch_info)
compiled = isinstance(program, compiler.CompiledProgram)
if is_heter:
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
fu = FleetUtil()
ret = fu.split_program_by_device(program)
if not compiled:
# TODO: Need a better way to distinguish and specify different execution mode
if program._pipeline_opt:
......@@ -1317,6 +1328,8 @@ class Executor(object):
trainer = TrainerFactory()._create_trainer(program._fleet_opt)
trainer._set_thread_barrier(program._is_distributed)
trainer._set_program(program)
if is_heter:
trainer._set_heter_info(ret)
else:
if program._pipeline_opt:
trainer = TrainerFactory()._create_trainer(
......@@ -1476,6 +1489,60 @@ class Executor(object):
debug, fetch_list, fetch_info,
print_period, fetch_handler)
def start_heter_trainer(self,
                        program=None,
                        scope=None,
                        debug=False,
                        fetch_list=None,
                        fetch_info=None,
                        print_period=100,
                        fetch_handler=None):
    """
    Start a heter trainer in training (non-inference) mode.

    All arguments are forwarded to ``_start_heter_trainer`` with
    ``is_infer`` fixed to False.

    Returns:
        The trainer instance returned by ``_start_heter_trainer``.
    """
    return self._start_heter_trainer(
        program=program,
        scope=scope,
        is_infer=False,
        debug=debug,
        fetch_list=fetch_list,
        fetch_info=fetch_info,
        print_period=print_period,
        fetch_handler=fetch_handler)
def _start_heter_trainer(self,
program=None,
scope=None,
is_infer=False,
debug=False,
fetch_list=None,
fetch_info=None,
print_period=100,
fetch_handler=None):
scope, trainer = self._prepare_trainer(
program=program,
dataset=None,
scope=scope,
thread=1,
debug=debug,
fetch_list=fetch_list,
fetch_info=fetch_info,
print_period=print_period)
trainer._set_infer(is_infer)
trainer._gen_trainer_desc()
self._dump_debug_info(program=program, trainer=trainer)
trainer_instance = self._default_executor.init_for_dataset(
program.desc, trainer._desc(), scope, None)
#if fetch_handler is not None:
# scope0 = trainer_instance.get_worker_scope(0)
# fetch_monitor = FetchHandlerMonitor(scope0, fetch_handler)
# fetch_monitor.start()
# self._default_executor.run_from_dataset(trainer_instance)
# fetch_monitor.stop()
# self._default_executor.release_trainer(trainer_instance)
#else:
self._default_executor.run_from_dataset(trainer_instance)
#self._default_executor.release_trainer(trainer_instance)
return trainer_instance
def train_from_dataset(self,
program=None,
dataset=None,
......
......@@ -149,6 +149,16 @@ class Fleet(object):
"""
return self._role_maker.is_server()
def is_xpu(self):
    """
    Check whether the node is an instance of xpu.

    The answer is delegated to the role maker's ``is_xpu()``.

    Returns:
        bool: True if this is a node of xpu,
              False if not.
    """
    return self._role_maker.is_xpu()
def split_files(self, files):
"""
split files before distributed training,
......
......@@ -28,6 +28,7 @@ __all__ = [
class Role:
    # Integer tags for the kind of node a process acts as; role makers
    # compare their ``_role`` attribute against these values.
    WORKER = 1
    SERVER = 2
    XPU = 3
class MockBarrier(object):
......@@ -988,6 +989,147 @@ class GeneralRoleMaker(RoleMakerBase):
http_server.stop()
class HeterRoleMaker(GeneralRoleMaker):
    """
    Role maker for heterogeneous jobs made of trainers, parameter servers and
    xpu nodes. You can set os.environ to customize:
        PADDLE_PSERVERS_IP_PORT_LIST : all pservers' ip:port, separated by ','
        PADDLE_TRAINER_ENDPOINTS     : all trainers' ip:port, separated by ','
        PADDLE_XPU_ENDPOINTS         : all xpu nodes' ip:port, separated by ','
        TRAINING_ROLE                : TRAINER, PSERVER or XPU
        PADDLE_TRAINER_ID            : current trainer id (only for trainer),
                                       it is index in PADDLE_TRAINER_ENDPOINTS
        PADDLE_XPU_ID                : current xpu id (only for xpu),
                                       it is index in PADDLE_XPU_ENDPOINTS
        PADDLE_PSERVER_ID            : current pserver id (only for pserver),
                                       it is index in PADDLE_PSERVERS_IP_PORT_LIST;
                                       when unset, POD_IP and PADDLE_PORT are
                                       used to locate the current endpoint
    """

    def _heter_init_gloo(self, rank, size, sub_path):
        """
        Create a Gloo communicator and initialize it under
        ``<hdfs_path>/<sub_path>`` with the given rank and size.
        """
        gloo = fluid.core.Gloo()
        gloo.init(rank, size,
                  self._hdfs_path.rstrip("/") + sub_path, self._hdfs_name,
                  self._hdfs_ugi, self._iface, self._prefix)
        return gloo

    def generate_role(self):
        """
        Generate the role of the current process from the environment and
        set up its communicators. Does nothing once the role is generated.

        Raises:
            ValueError: if TRAINING_ROLE is not TRAINER, PSERVER or XPU.
        """
        if self._role_is_generated:
            return

        eplist = os.environ["PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
        training_role = os.environ["TRAINING_ROLE"]
        worker_endpoints = os.environ["PADDLE_TRAINER_ENDPOINTS"].split(",")
        trainers_num = len(worker_endpoints)
        xpu_endpoints = os.environ["PADDLE_XPU_ENDPOINTS"].split(",")
        if training_role not in ["TRAINER", "PSERVER", "XPU"]:
            raise ValueError(
                "TRAINING_ROLE must be PSERVER or TRAINER or XPU")

        # Communicator among nodes of the same type.
        if training_role == "TRAINER":
            role = Role.WORKER
            current_id = int(os.environ["PADDLE_TRAINER_ID"])
            self._node_type = 1
            self._cur_endpoint = worker_endpoints[current_id]
            self._node_type_comm = self._heter_init_gloo(
                current_id, len(worker_endpoints), "/trainer")
        elif training_role == "XPU":
            role = Role.XPU
            current_id = int(os.environ["PADDLE_XPU_ID"])
            self._node_type = 2
            self._cur_endpoint = xpu_endpoints[current_id]
            self._node_type_comm = self._heter_init_gloo(
                current_id, len(xpu_endpoints), "/xpu")
        elif training_role == "PSERVER":
            role = Role.SERVER
            if os.environ.get("PADDLE_PSERVER_ID") is not None:
                current_id = int(os.environ["PADDLE_PSERVER_ID"])
                cur_endpoint = eplist[current_id]
            else:
                # this is for compatible with paddlecloud
                cur_ip = os.environ["POD_IP"]
                cur_port = os.environ["PADDLE_PORT"]
                cur_endpoint = ":".join([cur_ip, cur_port])
                current_id = eplist.index(cur_endpoint)
            self._node_type = 0
            self._cur_endpoint = cur_endpoint
            self._node_type_comm = self._heter_init_gloo(
                current_id, len(eplist), "/pserver")

        # Communicator shared by trainers and xpu nodes only; pservers never
        # get a _heter_comm.
        if training_role == "TRAINER" or training_role == "XPU":
            heter_list = worker_endpoints + xpu_endpoints
            self._heter_comm = self._heter_init_gloo(
                heter_list.index(self._cur_endpoint),
                len(heter_list), "/heter")

        # Communicator across every node of the job.
        all_list = worker_endpoints + eplist + xpu_endpoints
        self._all_comm = self._heter_init_gloo(
            all_list.index(self._cur_endpoint), len(all_list), "/all")

        self._trainers_num = trainers_num
        self._server_endpoints = eplist
        self._role = role
        self._current_id = current_id
        self._rank = all_list.index(self._cur_endpoint)
        self._size = len(all_list)
        self._worker_endpoints = worker_endpoints
        self._xpu_endpoints = xpu_endpoints
        self._role_is_generated = True

    def is_xpu(self):
        """
        whether current process is xpu
        """
        if not self._role_is_generated:
            self.generate_role()
        return self._role == Role.XPU

    def is_first_xpu(self):
        """
        whether current process is xpu of rank 0
        """
        if not self._role_is_generated:
            self.generate_role()
        return self._role == Role.XPU and self._current_id == 0

    def _barrier_xpu(self):
        """
        barrier all xpu nodes in current distributed job; no-op on other roles
        """
        if not self._role_is_generated:
            self.generate_role()
        if self.is_xpu():
            self._node_type_comm.barrier()

    def _barrier_heter(self):
        """
        barrier all workers and xpu nodes in current distributed job;
        no-op on other roles
        """
        if not self._role_is_generated:
            self.generate_role()
        # is_worker must be called: the bound method itself is always truthy,
        # which would send pservers to a _heter_comm they do not have.
        if self.is_xpu() or self.is_worker():
            self._heter_comm.barrier()

    def xpu_num(self):
        """
        number of xpu endpoints listed in PADDLE_XPU_ENDPOINTS
        """
        if not self._role_is_generated:
            self.generate_role()
        return len(self._xpu_endpoints)
class UserDefinedRoleMaker(RoleMakerBase):
"""
UserDefinedRoleMaker is designed for worker and server assignment
......
......@@ -23,6 +23,7 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
from paddle.fluid.incubate.fleet.base.mode import Mode
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
from paddle.fluid.incubate.fleet.base.role_maker import HeterRoleMaker
class PSLib(Fleet):
......@@ -44,6 +45,9 @@ class PSLib(Fleet):
role_maker = MPISymetricRoleMaker()
super(PSLib, self).init(role_maker)
self._fleet_ptr = fluid.core.Fleet()
self._heter_ptr = None
if isinstance(role_maker, HeterRoleMaker):
self._heter_ptr = fluid.core.Heter()
def _set_client_communication_config(self, request_timeout_ms,
connect_timeout_ms, max_retry):
......@@ -77,15 +81,27 @@ class PSLib(Fleet):
raise Exception(
"You should run DistributedOptimizer.minimize() first")
# barrier_all for init_server, wait for server starts
if isinstance(self._role_maker, HeterRoleMaker):
if self._role_maker.is_xpu():
local_endpoint = self._role_maker.get_local_endpoint()
local_endpoint = local_endpoint.split(":")
self._heter_ptr.start_xpu_service(
str(local_endpoint[0]), int(local_endpoint[1]))
self._role_maker._barrier_all()
self.all_ips_ = self._role_maker._all_gather(self._local_ip)
# worker_index * 2 is for compatible with older versions of pslib
self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
self._role_maker._get_size(),
self._role_maker.worker_index() * 2)
if isinstance(self._role_maker, HeterRoleMaker):
if self._role_maker.is_worker():
self._heter_ptr.set_xpu_list(
self._role_maker._xpu_endpoints)
self._heter_ptr.create_client2xpu_connection()
# barrier_all for init_worker
self._role_maker._barrier_all()
# prepare for client to client communication
if self._role_maker.is_worker():
info = self._fleet_ptr.get_clients_info()
all_info = self._role_maker._worker_gather(info[0])
self._fleet_ptr.gather_clients(all_info)
......@@ -144,6 +160,12 @@ class PSLib(Fleet):
>>> fleet.init_server("/you/path/to/model", mode = 0)
"""
mode = kwargs.get("mode", 0)
if isinstance(self._role_maker, HeterRoleMaker):
self._role_maker._barrier_xpu()
if self._role_maker.is_first_xpu():
self._fleet_ptr.load_model(model_dir, mode)
self._role_maker._barrier_xpu()
else:
self._role_maker._barrier_worker()
if self._role_maker.is_first_worker():
self._fleet_ptr.load_model(model_dir, mode)
......@@ -185,6 +207,54 @@ class PSLib(Fleet):
raise Exception(
"You should run DistributedOptimizer.minimize() first")
def end_pass(self, scope):
    """
    End the pass on the heter service and stop the xpu service for this
    worker index.

    Nothing happens unless the worker index is smaller than the number of
    xpu nodes reported by the role maker.
    """
    if not (self._role_maker.worker_index() < self._role_maker.xpu_num()):
        return
    self._heter_ptr.end_pass(scope, self._role_maker.worker_index())
    self._heter_ptr.stop_xpu_service(self._role_maker.worker_index())
def train_from_dataset(self,
                       executor,
                       program=None,
                       dataset=None,
                       scope=None,
                       thread=0,
                       debug=False,
                       fetch_list=None,
                       fetch_info=None,
                       print_period=100,
                       fetch_handler=None):
    """
    Wait on the heter barrier when running as a worker, then delegate to
    ``executor.train_from_dataset`` with the remaining arguments.
    """
    running_as_worker = self._role_maker.is_worker()
    if running_as_worker:
        self._role_maker._barrier_heter()

    train_args = (program, dataset, scope, thread, debug, fetch_list,
                  fetch_info, print_period, fetch_handler)
    executor.train_from_dataset(*train_args)
def start_heter_trainer(self,
                        executor,
                        program=None,
                        scope=None,
                        debug=False,
                        fetch_list=None,
                        fetch_info=None,
                        print_period=100,
                        fetch_handler=None):
    """
    Start the heter trainer through ``executor``, wait on the heter
    barrier when running as xpu, then release the trainer instance.
    """
    heter_args = (program, scope, debug, fetch_list, fetch_info,
                  print_period, fetch_handler)
    heter_trainer = executor.start_heter_trainer(*heter_args)

    running_as_xpu = self._role_maker.is_xpu()
    if running_as_xpu:
        print("barrier heter")
        self._role_maker._barrier_heter()
        print("barrier heter")

    # The executor hands back the trainer un-released; release it here.
    executor._default_executor.release_trainer(heter_trainer)
def stop_worker(self):
"""
stop(): will be called after a user finishes his/her training task. Fleet instance will be
......@@ -197,6 +267,7 @@ class PSLib(Fleet):
self._role_maker._barrier_worker()
if self._role_maker.is_first_worker():
self._fleet_ptr.stop_server()
# _heter_ptr is only created when the role maker is a HeterRoleMaker and is
# None otherwise, so the xpu service can only be stopped in heter jobs.
if self._heter_ptr is not None:
    self._heter_ptr.stop_xpu_service()
self._role_maker._barrier_worker()
self._role_maker._barrier_all()
self._role_maker._finalize()
......
......@@ -509,13 +509,15 @@ class DistributedAdam(DistributedOptimizerImplBase):
opt_info = {}
opt_info["program_id_to_worker"] = prog_id_to_worker
opt_info["program_configs"] = program_configs
opt_info["trainer"] = "DistMultiTrainer"
opt_info["trainer"] = strategy.get("trainer", "DistMultiTrainer")
opt_info["device_worker"] = strategy.get("device_worker", "DownpourSGD")
opt_info["optimizer"] = "DownpourSGD"
opt_info["fleet_desc"] = ps_param
opt_info["worker_skipped_ops"] = worker_skipped_ops
opt_info["use_cvm"] = strategy.get("use_cvm", False)
opt_info["no_cvm"] = strategy.get("no_cvm", False)
opt_info["worker_class"] = strategy.get("worker_class",
"DownpourWorker")
opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
opt_info["local_tables"] = strategy.get("local_tables", [])
opt_info["async_tables"] = strategy.get("async_tables", [])
......@@ -529,6 +531,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
opt_info["dump_file_num"] = strategy.get("dump_file_num", 16)
opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "")
opt_info["dump_param"] = strategy.get("dump_param", [])
opt_info["worker_places"] = strategy.get("worker_places", [])
if server._server.downpour_server_param.downpour_table_param[
0].accessor.accessor_class in [
"DownpourCtrAccessor", "DownpourCtrDoubleAccessor",
......
......@@ -14,6 +14,7 @@
"""Fleet Utils."""
import collections
import copy
import json
import logging
import math
......@@ -1615,3 +1616,123 @@ class FleetUtil(object):
"""
program = utils.load_program(prog_path, is_text)
utils.parse_program(program, output_dir)
def split_program_by_device(self, program):
    """
    Split the ops of ``program``'s global block into consecutive device
    segments according to their ``op_device`` attribute and describe the
    heter (non-cpu) segment.

    Args:
        program: program whose global block ops carry ``op_device``.

    Returns:
        None when no non-cpu segment exists, otherwise a list
        ``[start, end, send, recv, prog]`` for the last non-cpu segment
        found: the first and last op index of the segment, the variable
        names consumed by the following segment, the variable names taken
        from the preceding segment, and a pruned clone of the program.

    NOTE(review): block nesting below was reconstructed from a listing
    that lost its indentation -- confirm against the original file.
    """
    # Step 1: group consecutive ops with the same op_device into runs. An
    # empty device string is recorded as "cpu". Ops without the attribute
    # are not collected.
    ops_list = []
    type_list = []
    pre = None
    type_cpu = "cpu"
    for op in program.global_block().ops:
        if op.has_attr("op_device"):
            if pre is None or pre != op.attr("op_device"):
                ops_list.append([])
                type_list.append(
                    op.attr("op_device")
                    if op.attr("op_device") != "" else type_cpu)
            ops_list[-1].append(op)
            pre = op.attr("op_device")
    # Step 2: cpu runs enclosed between two runs of the same heter type are
    # relabelled with that heter type (both in type_list and on the ops).
    l = len(type_list)
    i = 0
    type_heter = None
    while i < l:
        # skip leading cpu runs
        while i < l and type_list[i] == type_cpu:
            i += 1
        if i == l:
            break
        type_heter = type_list[i]
        i += 1
        start = i
        valid = True
        # advance to the next run of the same heter type; any other non-cpu
        # type in between invalidates the merge
        while i < l and type_list[i] != type_heter:
            if type_list[i] != type_cpu:
                valid = False
                break
            i += 1
        if i == l:
            break
        elif not valid:
            continue
        for j in range(start, i):
            for op in ops_list[j]:
                op._set_attr("op_device", type_heter)
            type_list[j] = type_heter
            # has no effect: the for loop rebinds j on each iteration
            j += 1
    # Step 3: merge neighbouring runs that now share the same type.
    pre = None
    merged_ops_list = []
    merged_type_list = []
    for i in range(l):
        if pre is None or pre != type_list[i]:
            merged_ops_list.append([])
            merged_type_list.append(type_list[i])
        merged_ops_list[-1].extend(ops_list[i])
        pre = type_list[i]
    # Names of all non-persistable variables of the global block.
    data_vars = set()
    for k in program.global_block().vars:
        var = program.global_block().var(k)
        if not var.persistable:
            data_vars.add(var.name)
    # Step 4: for every segment collect its external inputs (names read
    # before being written inside the segment) and its outputs, then keep
    # the inputs that come from the previous segment. For segment 1 the
    # non-persistable variables are treated as available as well.
    l = len(merged_ops_list)
    inputs_pre = set()
    outputs_pre = set()
    in_from_pre = [[] for i in range(l)]
    for i in range(l):
        inputs = set()
        outputs = set()
        for op in merged_ops_list[i]:
            for input in op.input_names:
                for tmp in op.input(input):
                    if tmp not in outputs:
                        inputs.add(tmp)
            for output in op.output_names:
                for tmp in op.output(output):
                    outputs.add(tmp)
        if i == 0:
            in_from_pre[i] = []
        elif i == 1:
            in_from_pre[i] = (outputs_pre | data_vars) & inputs
        else:
            in_from_pre[i] = outputs_pre & inputs
        inputs_pre = copy.deepcopy(inputs)
        outputs_pre = copy.deepcopy(outputs)
    # Step 5: op index range of every segment, the variables each segment
    # sends to its successor, and one program clone per segment (pruned for
    # non-cpu segments). Indices count only the ops collected in step 1.
    l = len(in_from_pre)
    start_list = []
    end_list = []
    send_list = [[] for i in range(l)]
    sum = 0
    program_list = []
    for i in range(l):
        start_list.append(sum)
        end_list.append(sum + len(merged_ops_list[i]) - 1)
        sum += len(merged_ops_list[i])
        if i < l - 1:
            send_list[i].extend(list(in_from_pre[i + 1]))
        prog = program.clone()
        if merged_type_list[i] != type_cpu:
            prog = prog._prune_with_input(
                list(in_from_pre[i]), list(send_list[i]))
            program_list.append(prog)
        else:
            program_list.append(prog)
    recv_list = [list(i) for i in in_from_pre]
    # Step 6: locate the heter segment. More than one is only reported via
    # print; the last one found wins.
    found = False
    heter_index = None
    for i in range(len(merged_type_list)):
        t = merged_type_list[i]
        if t != type_cpu:
            if found:
                print("only one region of program can be heter")
            found = True
            heter_index = i
    if heter_index is None:
        print("warning: non heter program")
        return None
    else:
        return [start_list[heter_index], end_list[heter_index], send_list[heter_index], \
                recv_list[heter_index], program_list[heter_index]]
......@@ -20,12 +20,12 @@ from __future__ import print_function
from .layer_function_generator import autodoc
from ..framework import unique_name
from ..layer_helper import LayerHelper
from ..annotations import deprecated
from paddle.utils import deprecated
__all__ = []
@deprecated(since='0.15.0', instead="ParallelExecutor")
@deprecated(since='0.15.0', update_to="paddle.fluid.ParallelExecutor")
@autodoc()
def get_places(device_count=None, device_type=None):
helper = LayerHelper('get_places', **locals())
......
......@@ -37,6 +37,7 @@ from functools import reduce
from .. import core
from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
import paddle
from paddle.utils import deprecated
__all__ = [
'fc',
......@@ -9910,7 +9911,7 @@ def flatten(x, axis=1, name=None):
return out
def stack(x, axis=0):
def stack(x, axis=0, name=None):
"""
This OP stacks all the inputs :code:`x` along axis.
......@@ -9990,15 +9991,16 @@ def stack(x, axis=0):
data = layers.stack(x1) # stack according to axis 0, data.shape=[1, None, 1, 2]
"""
helper = LayerHelper('stack', **locals())
axis = 0 if axis is None else axis
if not isinstance(x, list) and not isinstance(x, tuple):
x = [x]
if in_dygraph_mode():
return core.ops.stack(x, 'axis', axis)
helper = LayerHelper('stack', **locals())
out = helper.create_variable_for_type_inference(x[0].dtype)
if not in_dygraph_mode() and \
x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
"number of the elements must be 1, but received %s." % len(x)
out_index = helper.create_variable_for_type_inference(dtype="int32")
......@@ -11614,6 +11616,7 @@ Examples:
return _elementwise_op(LayerHelper('elementwise_sub', **locals()))
@deprecated(since="2.0.0", update_to="paddle.multiply")
def elementwise_mul(x, y, axis=-1, act=None, name=None):
"""
:alias_main: paddle.elementwise_mul
......
......@@ -28,7 +28,6 @@ from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_bu
from .unique_name import UniqueNameGenerator
import logging
import warnings
from .dataset import DatasetBase, InMemoryDataset
### Dygraph DataLoader configs ###
import os
......@@ -1670,7 +1669,7 @@ class PyReader(DataLoaderBase):
class DatasetLoader(DataLoaderBase):
def __init__(self, dataset, places, drop_last):
assert isinstance(dataset,
assert isinstance(dataset, paddle.fleet.dataset.
DatasetBase), "dataset must be type of DatasetBase"
assert not in_dygraph_mode(
), "DatasetLoader is not supported in dygraph mode yet"
......@@ -1686,7 +1685,7 @@ class DatasetLoader(DataLoaderBase):
dataset.set_thread(thread_num)
if isinstance(dataset,
if isinstance(dataset, paddle.fleet.dataset.
InMemoryDataset) and dataset.queue_num > thread_num:
logging.warn("queue_num {} which is set in Dataset is ignored".
format(dataset.queue_num))
......
......@@ -210,7 +210,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
filelist.append(train_file_path)
# config dataset
dataset = fluid.DatasetFactory().create_dataset()
dataset = paddle.fleet.DatasetFactory().create_dataset()
dataset.set_batch_size(batch_size)
dataset.set_use_var(self.feeds)
pipe_command = 'python ctr_dataset_reader.py'
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_fusion_gru_op import TestFusionGRUOp
class TestFusionGRUMKLDNNOp(TestFusionGRUOp):
    # Same configuration as TestFusionGRUOp except that use_mkldnn is set.
    def set_confs(self):
        self.use_mkldnn = True
class TestFusionGRUMKLDNNOpNoInitial(TestFusionGRUOp):
    # MKLDNN variant with with_h0 disabled.
    def set_confs(self):
        self.with_h0 = False
        self.use_mkldnn = True
class TestFusionGRUMKLDNNOpNoBias(TestFusionGRUOp):
    # MKLDNN variant with with_bias disabled.
    def set_confs(self):
        self.with_bias = False
        self.use_mkldnn = True
class TestFusionGRUMKLDNNOpReverse(TestFusionGRUOp):
    # MKLDNN variant with is_reverse enabled.
    def set_confs(self):
        self.is_reverse = True
        self.use_mkldnn = True
class TestFusionGRUMKLDNNOpOriginMode(TestFusionGRUOp):
    # MKLDNN variant with origin_mode enabled.
    def set_confs(self):
        self.origin_mode = True
        self.use_mkldnn = True
class TestFusionGRUMKLDNNOpMD1(TestFusionGRUOp):
    # MKLDNN variant with M = 36 and D = 8 (presumably the input and hidden
    # sizes of the base test -- TODO confirm in TestFusionGRUOp).
    def set_confs(self):
        self.M = 36
        self.D = 8
        self.use_mkldnn = True
class TestFusionGRUMKLDNNOpMD2(TestFusionGRUOp):
    # MKLDNN variant with M = 8 and D = 8 (equal sizes).
    def set_confs(self):
        self.M = 8
        self.D = 8
        self.use_mkldnn = True
class TestFusionGRUMKLDNNOpMD3(TestFusionGRUOp):
    # MKLDNN variant with M = 17 and D = 15 (odd sizes).
    def set_confs(self):
        self.M = 17
        self.D = 15
        self.use_mkldnn = True
class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):
    # MKLDNN variant with lod = [[3]] (a single sequence, going by the class
    # name "BS1" -- TODO confirm) and D = 16.
    def set_confs(self):
        self.lod = [[3]]
        self.D = 16
        self.use_mkldnn = True
if __name__ == "__main__":
unittest.main()
......@@ -17,6 +17,7 @@ including create, config, run, etc.
"""
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.compat as cpt
import paddle.fluid.core as core
......@@ -37,23 +38,26 @@ class TestDataset(unittest.TestCase):
def test_dataset_create(self):
""" Testcase for dataset create. """
try:
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"QueueDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset(
dataset = paddle.fleet.DatasetFactory().create_dataset(
"FileInstantDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"MyOwnDataset")
self.assertTrue(False)
except:
self.assertTrue(True)
......@@ -91,7 +95,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -125,7 +130,7 @@ class TestDataset(unittest.TestCase):
dataset.set_trainer_num(4)
dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
dataset.set_download_cmd("./read_from_afs my_fs_name my_fs_ugi")
dataset.enable_pv_merge()
dataset.set_enable_pv_merge(False)
thread_num = dataset.get_thread_num()
self.assertEqual(thread_num, 12)
......@@ -171,7 +176,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([filename1, filename2])
......@@ -222,7 +228,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -293,7 +300,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(1)
dataset.set_parse_ins_id(True)
......@@ -359,7 +367,8 @@ class TestDataset(unittest.TestCase):
name="slot4", shape=[1], dtype="float32", lod_level=0)
slots_vars = [var1, var2, var3, var4]
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(1)
dataset.set_parse_ins_id(True)
......@@ -414,7 +423,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -507,7 +517,7 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -532,7 +542,7 @@ class TestDataset(unittest.TestCase):
except Exception as e:
self.assertTrue(False)
dataset2 = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset2 = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset2.set_use_var(slots_vars)
dataset2.set_batch_size(32)
dataset2.set_thread(3)
......@@ -573,7 +583,7 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -628,7 +638,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_input_type(1)
dataset.set_batch_size(1)
dataset.set_thread(2)
......@@ -707,7 +718,7 @@ class TestDatasetWithFetchHandler(unittest.TestCase):
inputs(list): inputs of get_dataset
files(list): files of get_dataset
"""
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(files)
......@@ -864,7 +875,8 @@ class TestDataset2(unittest.TestCase):
except ImportError as e:
print("warning: no mpi4py")
exe.run(startup_program)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -884,9 +896,6 @@ class TestDataset2(unittest.TestCase):
"""
Testcase for InMemoryDataset from create to run.
"""
self.skipTest("parameter server will add pslib UT later")
with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
......@@ -902,7 +911,7 @@ class TestDataset2(unittest.TestCase):
train_program = fluid.Program()
startup_program = fluid.Program()
scope = fluid.Scope()
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
with fluid.program_guard(train_program, startup_program):
slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
slots_vars = []
......@@ -936,7 +945,8 @@ class TestDataset2(unittest.TestCase):
except ImportError as e:
print("warning: no mpi4py")
exe.run(startup_program)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -952,6 +962,63 @@ class TestDataset2(unittest.TestCase):
print("warning: catch expected error")
fleet._opt_info = None
fleet._fleet_ptr = None
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_rank_offset("")
dataset.set_pv_batch_size(1)
dataset.set_hdfs_config("", "")
d = paddle.fleet.DatasetBase()
try:
dataset.set_feed_type("MultiSlotInMemoryDataFeed")
except:
print("warning: catch expected error")
dataset.thread_num = 0
try:
dataset._prepare_to_run()
except:
print("warning: catch expected error")
dataset.set_parse_logkey(True)
dataset.set_merge_by_sid(True)
dataset.set_enable_pv_merge(True)
try:
dataset.preprocess_instance()
except:
print("warning: catch expected error")
try:
dataset.set_current_phase(1)
except:
print("warning: catch expected error")
try:
dataset.postprocess_instance()
except:
print("warning: catch expected error")
dataset.set_fleet_send_batch_size(1024)
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
dataset.get_pv_data_size()
dataset.get_memory_data_size()
dataset.get_shuffle_data_size()
dataset = paddle.fleet.DatasetFactory().create_dataset(
"QueueDataset")
try:
dataset.local_shuffle()
except:
print("warning: catch expected error")
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
dataset = paddle.fleet.FileInstantDataset()
try:
dataset.local_shuffle()
except:
print("warning: catch expected error")
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
os.remove("./test_in_memory_dataset2_run2_a.txt")
os.remove("./test_in_memory_dataset2_run2_b.txt")
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import numpy as np
import six
......@@ -96,7 +97,8 @@ class DatasetLoaderTestBase(unittest.TestCase):
def check_batch_number(self, place, randomize_batch_num=False):
main_prog, startup_prog, feeds = self.build_network()
dataset = fluid.DatasetFactory().create_dataset(self.dataset_name)
dataset = paddle.fleet.DatasetFactory().create_dataset(
self.dataset_name)
dataset.set_batch_size(BATCH_SIZE)
if isinstance(place, fluid.CPUPlace):
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle
from op_test import OpTest
class TestFlattenOp(OpTest):
    """Base OpTest case for the ``flatten_contiguous_range`` operator.

    Subclasses override ``init_test_case`` to choose the input shape, the
    axis range to flatten and the expected output shape.
    """

    def setUp(self):
        self.op_type = "flatten_contiguous_range"
        # Defaults; init_test_case() may overwrite them.
        self.start_axis = 0
        self.stop_axis = -1
        self.init_test_case()
        self.inputs = {"X": np.random.random(self.in_shape).astype("float64")}
        self.init_attrs()
        self.outputs = {
            # Expected result: the input reshaped to the flattened shape.
            "Out": self.inputs["X"].reshape(self.new_shape),
            # Placeholder only: XShape is excluded from the output check
            # (see no_check_set in test_check_output).
            "XShape": np.random.random(self.in_shape).astype("float32")
        }

    def test_check_output(self):
        self.check_output(no_check_set=["XShape"])

    def test_check_grad(self):
        self.check_grad(["X"], "Out")

    def init_test_case(self):
        """Flatten every axis of a (3, 2, 5, 4) input into one dimension."""
        self.in_shape = (3, 2, 5, 4)
        self.start_axis = 0
        self.stop_axis = -1
        # A real 1-tuple: "(120)" is just the int 120, unlike the tuple
        # shapes used by the sibling test cases.
        self.new_shape = (120, )

    def init_attrs(self):
        """Build the operator attributes from the configured axis range."""
        self.attrs = {
            "start_axis": self.start_axis,
            "stop_axis": self.stop_axis
        }
class TestFlattenOp_1(TestFlattenOp):
    """Flatten axes 1..2 of a (3, 2, 5, 4) input; expected shape (3, 10, 4)."""

    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis, self.stop_axis = 1, 2
        self.new_shape = (3, 10, 4)

    def init_attrs(self):
        self.attrs = dict(
            start_axis=self.start_axis, stop_axis=self.stop_axis)
class TestFlattenOp_2(TestFlattenOp):
    """Flatten axes 0..1 of a (3, 2, 5, 4) input; expected shape (6, 5, 4)."""

    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis, self.stop_axis = 0, 1
        self.new_shape = (6, 5, 4)

    def init_attrs(self):
        self.attrs = dict(
            start_axis=self.start_axis, stop_axis=self.stop_axis)
class TestFlattenOp_3(TestFlattenOp):
    """Flatten axes 0..2 of a (3, 2, 5, 4) input; expected shape (30, 4)."""

    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis, self.stop_axis = 0, 2
        self.new_shape = (30, 4)

    def init_attrs(self):
        self.attrs = dict(
            start_axis=self.start_axis, stop_axis=self.stop_axis)
class TestFlattenOp_4(TestFlattenOp):
    """Flatten axes -2..-1 (negative indices) of a (3, 2, 5, 4) input;
    expected shape (3, 2, 20)."""

    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis, self.stop_axis = -2, -1
        self.new_shape = (3, 2, 20)

    def init_attrs(self):
        self.attrs = dict(
            start_axis=self.start_axis, stop_axis=self.stop_axis)
class TestFlattenOp_5(TestFlattenOp):
    """start_axis == stop_axis == 2: the expected shape equals the input
    shape (3, 2, 5, 4)."""

    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis, self.stop_axis = 2, 2
        self.new_shape = (3, 2, 5, 4)

    def init_attrs(self):
        self.attrs = dict(
            start_axis=self.start_axis, stop_axis=self.stop_axis)
class TestFlattenOpSixDims(TestFlattenOp):
    """Flatten axes 3..5 of a six-dimensional (3, 2, 3, 2, 4, 4) input;
    expected shape (3, 2, 3, 32)."""

    def init_test_case(self):
        self.in_shape = (3, 2, 3, 2, 4, 4)
        self.start_axis, self.stop_axis = 3, 5
        self.new_shape = (3, 2, 3, 32)

    def init_attrs(self):
        self.attrs = dict(
            start_axis=self.start_axis, stop_axis=self.stop_axis)
class TestFlatten2OpError(unittest.TestCase):
    """Checks the exceptions ``paddle.flatten`` is expected to raise."""

    def test_errors(self):
        image_shape = (2, 3, 4, 4)
        numel = (image_shape[0] * image_shape[1] * image_shape[2] *
                 image_shape[3])
        x = (np.arange(numel).reshape(image_shape) / 100.).astype('float32')

        def flatten_axes(start_axis, stop_axis):
            x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
            paddle.flatten(x_var, start_axis=start_axis, stop_axis=stop_axis)

        # In order: start_axis > stop_axis, start_axis beyond the rank,
        # stop_axis beyond the rank.
        for axes in ((2, 1), (10, 1), (2, 10)):
            self.assertRaises(ValueError, flatten_axes, *axes)

        def flatten_float16():
            # dtype must be float32, float64, int8, int32, int64.
            x2 = (np.arange(numel).reshape(image_shape) / 100.
                  ).astype('float16')
            x2_var = paddle.data(name='x2', shape=[3, 2, 4, 5], dtype='float16')
            paddle.flatten(x2_var)

        self.assertRaises(TypeError, flatten_float16)

        def flatten_ndarray():
            # x is a numpy array here, not a Variable.
            paddle.flatten(x)

        self.assertRaises(ValueError, flatten_ndarray)
class TestFlattenPython(unittest.TestCase):
    """Exercises the Python-level ``paddle.flatten`` API."""

    def test_python_api(self):
        image_shape = (2, 3, 4, 4)
        numel = (image_shape[0] * image_shape[1] * image_shape[2] *
                 image_shape[3])
        x = (np.arange(numel).reshape(image_shape) / 100.).astype('float32')

        def flatten_ndarray():
            # x is a numpy array here, not a Variable.
            paddle.flatten(x)

        self.assertRaises(ValueError, flatten_ndarray)

        def flatten_last_two_axes():
            paddle.enable_imperative()
            img = paddle.imperative.to_variable(x)
            flattened = paddle.flatten(img, start_axis=-2, stop_axis=-1)
            return flattened.numpy().shape

        res_shape = flatten_last_two_axes()
        self.assertTrue((2, 3, 16) == res_shape)
if __name__ == "__main__":
unittest.main()
......@@ -14,6 +14,7 @@
"""Test cases for role makers."""
from __future__ import print_function
import paddle
import os
import unittest
......@@ -162,7 +163,8 @@ class TestCloudRoleMaker2(unittest.TestCase):
data = "1 1 1 1\n"
f.write(data)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
dataset.set_use_var([show, label])
dataset.load_into_memory()
......
......@@ -30,6 +30,7 @@ def fusion_gru(
wh, # D x 3D
bias, # 1 x 3D
is_reverse,
origin_mode,
act_state,
act_gate):
return gru(fc(x, wx, bias),
......@@ -40,7 +41,8 @@ def fusion_gru(
(1, wh.shape[1]), dtype='float32'),
is_reverse,
act_state,
act_gate)
act_gate,
origin_mode=origin_mode)
class TestFusionGRUOp(OpTest):
......@@ -57,6 +59,8 @@ class TestFusionGRUOp(OpTest):
self.with_bias = True
self.act_state = 'tanh'
self.act_gate = 'sigmoid'
self.origin_mode = False
self.use_mkldnn = False
self.set_confs()
T = sum(self.lod[0])
......@@ -73,7 +77,7 @@ class TestFusionGRUOp(OpTest):
(N, self.D), dtype='float32')
_, _, _, hidden = fusion_gru(
x, self.lod, h0, wx, wh, bias, self.is_reverse,
x, self.lod, h0, wx, wh, bias, self.is_reverse, self.origin_mode,
ACTIVATION[self.act_state], ACTIVATION[self.act_gate])
self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh}
......@@ -89,7 +93,9 @@ class TestFusionGRUOp(OpTest):
self.attrs = {
'activation': self.act_state,
'gate_activation': self.act_gate,
'is_reverse': self.is_reverse
'is_reverse': self.is_reverse,
'origin_mode': self.origin_mode,
'use_mkldnn': self.use_mkldnn
}
def test_check_output(self):
......
......@@ -180,6 +180,51 @@ class TestLayer(LayerTest):
self.assertRaises(TypeError, test_type)
def test_Flatten(self):
    """Compare ``nn.Flatten`` between static and dynamic graph mode, then
    check that ``nn.Linear`` raises TypeError when called on numpy arrays."""
    inp = np.ones([3, 4, 4, 5], dtype='float32')

    with self.static_graph():
        data_var = layers.data(
            name='data',
            shape=[3, 4, 4, 5],
            dtype='float32',
            append_batch_size=False)
        static_layer = nn.Flatten()
        ret = static_layer(data_var)
        static_ret = self.get_static_graph_result(
            feed={'data': inp}, fetch_list=[ret])[0]

    with self.dynamic_graph():
        dy_layer = nn.Flatten()
        dy_ret = dy_layer(base.to_variable(inp))
        dy_ret_value = dy_ret.numpy()

    self.assertTrue(np.array_equal(static_ret, dy_ret_value))

    with self.static_graph():

        def linear_on_ndarray(dtype):
            arr = np.ones([3, 32, 32], dtype=dtype)
            linear = nn.Linear(
                32,
                4,
                bias_attr=fluid.initializer.ConstantInitializer(value=1))
            linear(arr)

        # the input of Linear must be Variable.
        self.assertRaises(TypeError, linear_on_ndarray, 'float32')

        # the input dtype of Linear must be float16 or float32 or float64
        # float16 only can be set on GPU place
        self.assertRaises(TypeError, linear_on_ndarray, 'int32')
def test_layer_norm(self):
inp = np.ones([3, 32, 32], dtype='float32')
with self.static_graph():
......
......@@ -16,6 +16,7 @@ TestCases for Monitor
"""
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
......@@ -51,7 +52,8 @@ class TestDatasetWithStat(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
......
......@@ -17,6 +17,7 @@ no_check_set_white_list = [
'fake_quantize_range_abs_max',
'coalesce_tensor',
'flatten2',
'flatten_contiguous_range',
'lrn',
'squeeze2',
'reshape2',
......
......@@ -15,7 +15,10 @@
import sys
import os
__all__ = ['TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer']
__all__ = [
'TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer',
'HeterXpuTrainer'
]
class TrainerDesc(object):
......@@ -48,6 +51,43 @@ class TrainerDesc(object):
self._program = None
self._infer = False
def _set_heter_info(self, ret):
#ret = = fu.split_program_by_device(program)
#start_list, end_list, send_list, recv_list, program_list = fu.split_program_by_device(program)
#if len(start_list) != 3:
# print("start_list len=", len(start_list), " will not set heter info")
# return
#for i in start_list[0]:
# self.proto_desc.op_run_start_idx.append(i)
#for i in end_list[0]:
# self.proto_desc.op_run_end_idx.append(i)
#for i in send_list[0]:
# self.proto_desc.op_run_send_list.append(i)
#for i in recv_list[0]:
# self.proto_desc.op_run_recv_list.append(i)
if ret is None:
return
#for i in ret[0]: # start_list[1]:
# self.proto_desc.xpu_start_idx.append(i)
self.proto_desc.xpu_start_idx = ret[0]
#for i in ret[1]: #end_list[1]:
# self.proto_desc.o_end_idx.append(i)
self.proto_desc.xpu_end_idx = ret[1]
for i in ret[2]: # send_list[1]:
self.proto_desc.xpu_send_list.append(i)
for i in ret[3]: # recv_list[1]:
self.proto_desc.xpu_recv_list.append(i)
#for i in start_list[2]:
# self.proto_desc.op_run_end_start_idx.append(i)
#for i in end_list[2]:
# self.proto_desc.op_run_end_idx.append(i)
#for i in send_list[2]:
# self.proto_desc.op_run_end_send_list.append(i)
#for i in recv_list[2]:
# self.proto_desc.op_run_end_recv_list.append(i)
def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
# convert fetch_info to list
fetch_info = list(fetch_info)
......@@ -122,6 +162,10 @@ class TrainerDesc(object):
for param in dump_param:
self.proto_desc.dump_param.append(param)
def _set_worker_places(self, worker_places):
for place in worker_places:
self.proto_desc.worker_places.append(place)
def _set_thread_barrier(self, thread_barrier):
self.proto_desc.thread_barrier = thread_barrier
......@@ -272,6 +316,30 @@ class DistMultiTrainer(TrainerDesc):
self._device_worker._gen_worker_desc(self.proto_desc)
class HeterXpuTrainer(TrainerDesc):
    """
    Implement of HeterXpuTrainer.
    It's for Distributed training.
    """

    def __init__(self):
        super(HeterXpuTrainer, self).__init__()

    def _set_program(self, program):
        """Record ``program`` through the base class and on this instance."""
        super(HeterXpuTrainer, self)._set_program(program)
        self._program = program

    def _gen_trainer_desc(self):
        """Fill ``self.proto_desc`` for this trainer and its device worker.

        Raises:
            RuntimeError: if no program has been set.
        """
        super(HeterXpuTrainer, self)._gen_trainer_desc()
        self.proto_desc.class_name = "HeterXpuTrainer"
        # Identity test: "== None" would go through any custom __eq__.
        if self._program is None:
            raise RuntimeError("None Program")
        self._device_worker._set_infer(self._infer)
        self._device_worker._set_program(self._program)
        self._device_worker._gen_worker_desc(self.proto_desc)
class PipelineTrainer(TrainerDesc):
"""
Implement of PipelineTrainer.
......
......@@ -22,7 +22,7 @@ from paddle.fluid.log_helper import get_logger
local_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer
from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer
from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT
from .framework import Variable
from multiprocessing import Process, Manager
......@@ -75,6 +75,8 @@ class TrainerFactory(object):
if opt_info.get("dump_param") is not None and len(
opt_info.get("dump_param")) != 0:
trainer._set_dump_param(opt_info["dump_param"])
if opt_info.get("worker_places") is not None:
trainer._set_worker_places(opt_info["worker_places"])
if opt_info.get("enable_random_dump") is not None:
trainer._set_enable_random_dump(opt_info[
"enable_random_dump"])
......
......@@ -63,6 +63,7 @@ from .layer.common import Pool2D #DEFINE_ALIAS
from .layer.common import Pad2D #DEFINE_ALIAS
from .layer.common import Embedding #DEFINE_ALIAS
from .layer.common import Linear #DEFINE_ALIAS
from .layer.common import Flatten #DEFINE_ALIAS
from .layer.common import UpSample #DEFINE_ALIAS
from .layer.conv import Conv2D #DEFINE_ALIAS
from .layer.conv import Conv2DTranspose #DEFINE_ALIAS
......
......@@ -39,6 +39,7 @@ from .common import Pool2D #DEFINE_ALIAS
from .common import Pad2D #DEFINE_ALIAS
from .common import Embedding #DEFINE_ALIAS
from .common import Linear #DEFINE_ALIAS
from .common import Flatten #DEFINE_ALIAS
from .common import UpSample #DEFINE_ALIAS
from .conv import Conv2D #DEFINE_ALIAS
from .conv import Conv2DTranspose #DEFINE_ALIAS
......
......@@ -17,6 +17,7 @@ from ...fluid.dygraph import BilinearTensorProduct #DEFINE_ALIAS
from ...fluid.dygraph import Pool2D #DEFINE_ALIAS
from ...fluid.dygraph import Embedding #DEFINE_ALIAS
from ...fluid.dygraph import Linear #DEFINE_ALIAS
from ...fluid.dygraph import Flatten #DEFINE_ALIAS
from ...fluid.dygraph import layers
from .. import functional as F
......
......@@ -25,7 +25,6 @@ import numpy as np
from ..fluid.layers import cast #DEFINE_ALIAS
from ..fluid.layers import expand #DEFINE_ALIAS
from ..fluid.layers import expand_as #DEFINE_ALIAS
from ..fluid.layers import flatten #DEFINE_ALIAS
from ..fluid.layers import reshape #DEFINE_ALIAS
from ..fluid.layers import scatter #DEFINE_ALIAS
from ..fluid.layers import slice #DEFINE_ALIAS
......@@ -169,6 +168,114 @@ def flip(x, axis, name=None):
reverse = flip #DEFINE_ALIAS
def flatten(x, start_axis=0, stop_axis=-1, name=None):
    """
    **Flatten op**

    Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis.

    For Example:

    .. code-block:: text

        Case 1:

          Given
            X.shape = (3, 100, 100, 4)

          and
            start_axis = 1
            stop_axis = 2

          We get:
            Out.shape = (3, 100 * 100, 4)

        Case 2:

          Given
            X.shape = (3, 100, 100, 4)

          and
            start_axis = 0
            stop_axis = -1

          We get:
            Out.shape = (3 * 100 * 100 * 4)

    Args:
        x (Variable): The tensor to flatten. A tensor with data type float32,
                      float64, int8, int32, int64.
        start_axis (int): the first axis to flatten, in range [-rank(x), rank(x)).
        stop_axis (int): the last axis to flatten (inclusive), in range
                      [-rank(x), rank(x)). After negative values are
                      normalized it must not be smaller than start_axis.
        name(str, Optional): For details, please refer to :ref:`api_guide_Name`.
                        Generally, no setting is required. Default: None.

    Returns:
        Variable: A tensor with the contents of the input tensor, with input \
                  axes flattened by indicated start axis and stop axis. \
                  A Tensor with data type same as input x.

    Raises:
        ValueError: If x is not a Variable.
        ValueError: If start_axis or stop_axis is illegal.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            paddle.enable_imperative()

            image_shape=(2, 3, 4, 4)
            x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100.
            x = x.astype('float32')

            img = paddle.imperative.to_variable(x)
            out = paddle.flatten(img, start_axis=1, stop_axis=2)
            # out shape is [2, 12, 4]
    """
    if not (isinstance(x, Variable)):
        raise ValueError("The input x should be a Variable")

    check_variable_and_dtype(
        x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64'], 'flatten')
    helper = LayerHelper('flatten', **locals())

    x_dim = len(x.shape)
    if not (isinstance(start_axis, int)) or (
            start_axis > x_dim - 1) or start_axis < -x_dim:
        raise ValueError(
            "The start_axis should be a int, and in range [-rank(x), rank(x))")
    if not (isinstance(stop_axis, int)) or (
            stop_axis > x_dim - 1) or stop_axis < -x_dim:
        raise ValueError(
            "The stop_axis should be a int, and in range [-rank(x), rank(x))")

    # Normalize negative axes before comparing the two bounds.
    if start_axis < 0:
        start_axis = start_axis + x_dim
    if stop_axis < 0:
        stop_axis = stop_axis + x_dim
    if start_axis > stop_axis:
        # Equal axes are accepted, so the requirement is ">=".
        raise ValueError(
            "The stop_axis should be greater than or equal to start_axis")

    if in_dygraph_mode():
        # The second return value of the op is not used.
        dy_out, _ = core.ops.flatten_contiguous_range(
            x, 'start_axis', start_axis, 'stop_axis', stop_axis)
        return dy_out

    out = helper.create_variable_for_type_inference(x.dtype)
    x_shape = helper.create_variable_for_type_inference(x.dtype)
    helper.append_op(
        type='flatten_contiguous_range',
        inputs={"X": x},
        outputs={'Out': out,
                 'XShape': x_shape},
        attrs={"start_axis": start_axis,
               "stop_axis": stop_axis})
    return out
def roll(x, shifts, axis=None, name=None):
"""
:alias_main: paddle.roll
......@@ -252,13 +359,18 @@ def roll(x, shifts, axis=None, name=None):
return out
def stack(x, axis=0, out=None, name=None):
def stack(x, axis=0, name=None):
"""
:alias_main: paddle.stack
:alias: paddle.stack,paddle.tensor.stack,paddle.tensor.manipulation.stack
:alias: paddle.stack, paddle.tensor.stack, paddle.tensor.manipulation.stack
This OP stacks all the input tensors ``x`` along ``axis`` dimemsion.
All tensors must be of the same shape and same dtype.
For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked
tensor is [N, A, B]; if ``axis == 1``, the shape of stacked
tensor is [A, N, B], etc.
This OP stacks all the inputs :code:`x` along axis.
.. code-block:: text
......@@ -284,7 +396,6 @@ def stack(x, axis=0, out=None, name=None):
Case 2:
Input:
x[0].shape = [1, 2]
x[0].data = [ [1.0 , 2.0 ] ]
......@@ -295,7 +406,7 @@ def stack(x, axis=0, out=None, name=None):
Attrs:
axis = 1 or axis = -2
axis = 1 or axis = -2 # If axis = -2, axis = axis+ndim(x[0])+1 = -2+2+1 = 1.
Output:
Out.shape = [1, 3, 2]
......@@ -304,65 +415,40 @@ def stack(x, axis=0, out=None, name=None):
[5.0, 6.0] ] ]
Args:
x (Variable|list(Variable)): Input :code:`x` can be a single Tensor, a :code:`list` of Tensors.
If :code:`x` is a :code:`list`, the shapes of all these Tensors
must be the same. Supposing input is N dims
Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims
Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`.
Support data types: float32, float64, int32, int64.
axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
The default value of axis is 0.
x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors.
If ``x`` is a ``list``, the Tensors in ``x``
must be of the same shape and dtype. Support data types: float32, float64, int32, int64.
axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.
Returns:
Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`.
Tensor: The stacked tensor with same data type as input.
Example:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
import numpy as np
data1 = np.array([[1.0, 2.0]])
data2 = np.array([[3.0, 4.0]])
data3 = np.array([[5.0, 6.0]])
with fluid.dygraph.guard():
x1 = fluid.dygraph.to_variable(data1)
x2 = fluid.dygraph.to_variable(data2)
x3 = fluid.dygraph.to_variable(data3)
result = paddle.stack([x1, x2, x3], axis=0)
# result shape: [3, 1, 2]
# result value: [[[1.0, 2.0]],
# [[3.0, 4.0]],
# [[5.0, 6.0]]]
"""
helper = LayerHelper('stack', **locals())
axis = 0 if axis is None else axis
if not isinstance(x, list) and not isinstance(x, tuple):
x = [x]
out = helper.create_variable_for_type_inference(x[0].dtype)
if not in_dygraph_mode() and \
x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
"number of the elements must be 1, but received %s." % len(x)
out_index = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op(
type='tensor_array_to_tensor',
inputs={'X': x[0]},
outputs={'Out': [out],
'OutIndex': [out_index]},
attrs={'axis': axis,
'use_stack': True})
else:
helper.append_op(
type='stack',
inputs={'X': x},
outputs={'Y': out},
attrs={'axis': axis})
return out
paddle.enable_imperative()
x1 = paddle.imperative.to_variable(data1)
x2 = paddle.imperative.to_variable(data2)
x3 = paddle.imperative.to_variable(data3)
out = paddle.stack([x1, x2, x3], axis=0)
print(out.shape) # [3, 1, 2]
print(out.numpy())
# [[[1., 2.]],
# [[3., 4.]],
# [[5., 6.]]]
"""
return layers.stack(x, axis, name)
def split(x, num_or_sections, axis=0, name=None):
......
......@@ -16,8 +16,9 @@ from .plot import Ploter
from .profiler import ProfilerOptions
from .profiler import Profiler
from .profiler import get_profiler
from .deprecated import deprecated
__all__ = ['dump_config', 'Ploter']
__all__ = ['dump_config', 'Ploter', 'deprecated']
#TODO: define new api under this directory
# __all__ = ['unique_name',
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
decorator to deprecate a function or class
"""
import warnings
import functools
import paddle
def deprecated(update_to="", since="", reason=""):
    """Decorate a function to signify its deprecation.

    The returned wrapper emits a :class:`DeprecationWarning` when the
    decorated API is called and then forwards the call unchanged. The
    docstring of the decorated API is not modified by this decorator.

    Args:
        since(str): The version at which the decorated method is considered deprecated.
            An empty string means "deprecated in all versions".
        update_to(str): The new API users should use. Must start with "paddle."
            when it is not empty.
        reason(str): The reason why the API is deprecated.

    Returns:
        decorator: decorated function or class.
    """

    def _version_key(version):
        """Turn a dotted version string into a list of at least four ints.

        Only the leading ASCII digits of each component are used, so
        "2.0.0-rc1" becomes [2, 0, 0, 0]; a component without leading
        digits (including an empty one) counts as 0.
        """
        key = []
        for piece in version.split("."):
            digits = ""
            for ch in piece.strip():
                if ch not in "0123456789":
                    break
                digits += ch
            key.append(int(digits) if digits else 0)
        key += [0] * (4 - len(key))
        return key

    def decorator(func):
        """construct warning message, and return a decorated function or class."""
        assert isinstance(update_to, str), 'type of "update_to" must be str.'
        assert isinstance(since, str), 'type of "since" must be str.'
        assert isinstance(reason, str), 'type of "reason" must be str.'

        _since = since.strip()
        _update_to = update_to.strip()
        _reason = reason.strip()

        msg = 'API "{}.{}" is deprecated'.format(func.__module__, func.__name__)
        if len(_since) > 0:
            msg += " since {}".format(_since)
        msg += ", and may be removed in future versions."
        if len(_update_to) > 0:
            assert _update_to.startswith(
                "paddle."
            ), 'Argument update_to must start with "paddle.", your value is "{}"'.format(
                update_to)
            msg += ' Use "{}" instead.'.format(_update_to)
        if len(_reason) > 0:
            msg += "\n reason: {}".format(_reason)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            """deprecated warning should be fired in 3 circumstances:
            1. current version is develop version, i.e. "0.0.0", because we assume develop version is always the latest version.
            2. since version is empty, in this case, API is deprecated in all versions.
            3. current version is newer than (or equal to) since version.
            """
            # The two string checks come first so that an empty "since" is
            # never parsed as a version; previously int("") raised
            # ValueError before this condition was reached.
            if (paddle.__version__ == "0.0.0" or _since == "" or
                    _version_key(paddle.__version__) >= _version_key(_since)):
                # Force the warning to be shown on every call.
                warnings.simplefilter('always', DeprecationWarning)
                warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
                # Go back to the 'default' action for DeprecationWarning.
                warnings.simplefilter('default', DeprecationWarning)

            return func(*args, **kwargs)

        return wrapper

    return decorator
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册