diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 5b03cbf8c7f844e163020ca17d25dc4b732fe636..ecf2dbc81762a59d4d826ae8f5dfc0ab48a28910 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -118,7 +118,7 @@ function(op_library TARGET) "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" -"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op") +"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e1cb683e1ecf12d507a954003a8fae6312b85324..ff2d08bb772605e5b214b41037e301ced2e85dcf 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(fleet) add_subdirectory(io) #ddim lib proto_library(framework_proto SRCS framework.proto) +proto_library(heter_service_proto SRCS heter_service.proto) proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto data_feed_proto) @@ -195,20 +196,37 @@ cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc o if(WITH_DISTRIBUTE) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc - data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc + heterxpu_trainer.cc + data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry - device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper box_wrapper lodtensor_printer + device_context scope framework_proto trainer_desc_proto glog fs shell + fleet_wrapper heter_wrapper box_wrapper lodtensor_printer lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS} - graph_to_program_pass variable_helper data_feed_proto timer monitor) + graph_to_program_pass variable_helper data_feed_proto timer monitor + heter_service_proto) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +elseif(WITH_PSLIB) + cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc + dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc + heterxpu_trainer.cc + data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc + pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry + device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog + lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method + graph_to_program_pass variable_helper timer monitor pslib_brpc ) + # TODO: Fix these unittest failed on Windows + if(NOT WIN32) + cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) + 
  endif()
else()
  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
              dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-             data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+             heterxpu_trainer.cc
+             data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
              pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-             device_context scope framework_proto data_feed_proto trainer_desc_proto glog
-             lod_rank_table fs shell fleet_wrapper box_wrapper lodtensor_printer feed_fetch_method
+             device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
+             lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
              graph_to_program_pass variable_helper timer monitor)
  # TODO: Fix these unittest failed on Windows
  if(NOT WIN32)
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 07470ef8532a0a0526d2e6228571716da37d78a8..04befbe1ca01d4bfec5872a63565f21d110a6c67 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -27,6 +27,7 @@ limitations under the License. */
 #include

 #include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/heter_service.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -51,10 +52,23 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);

 class FleetWrapper;

+#ifdef PADDLE_WITH_PSLIB
+class HeterWrapper;
+#endif
+
 class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
   virtual void Initialize(const TrainerDesc& param);
+#ifdef PADDLE_WITH_CUDA
+  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+
+  void AddPlace(const paddle::platform::Place place) {
+    places_.push_back(place);
+  }
+
+  void AddThreadScope(Scope* scope) { thread_scopes_.push_back(scope); }
+#endif
   int Start();
   void Stop();
   void SetRootScope(Scope* scope) { root_scope_ = scope; }
@@ -62,6 +76,7 @@ class PullDenseWorker {
   void ResetThreadVersion(uint64_t table_id);
   void Wait(std::vector<::std::future<int32_t>>* status_vec);
   void PullDense(bool force_update = false);
+  void CreatePinVar();
   int GetThreadIdByScope(const Scope* scope);
   void SetThreadIdByScope(const Scope* scope, int tid);
   static std::shared_ptr<PullDenseWorker> GetInstance() {
@@ -105,6 +120,12 @@ class PullDenseWorker {
   std::mutex mutex_for_mean_scale_;
   float total_batch_num_ = 0;
   std::unordered_map<const Scope*, int> scope_to_thread_id_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::vector<cudaStream_t> copy_streams_;
+  std::vector<paddle::platform::Place> places_;
+  std::vector<Scope*> thread_scopes_;
+#endif
 };

 // should incorporate different type of device
@@ -126,6 +147,8 @@ class DeviceWorker {
   virtual void BindingDataFeedMemory() = 0;
   virtual void SetRootScope(Scope* root_scope);
   virtual void SetDataFeed(DataFeed* data_feed);
+  virtual void SetWorkerNum(int num) {}
+  virtual void CacheProgram(const ProgramDesc& main_program) {}
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
   }
@@ -161,6 +184,7 @@ class DeviceWorker {
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
+  TrainerDesc trainer_desc_;

   // dump params or grads for debug
   bool need_dump_param_;
@@ -306,6 +330,87 @@ class DownpourWorkerOpt : public DownpourWorker {
   uint64_t async_tid_ = 0;
 };

+#ifdef PADDLE_WITH_PSLIB
+class HeterCpuWorker : public HogwildWorker {
+ public:
+  HeterCpuWorker() {}
+  virtual ~HeterCpuWorker() {}
+  virtual void Initialize(const TrainerDesc& desc);
+  virtual void TrainFiles();
+  virtual void TrainFilesWithProfiler();
+  virtual void SetNeedDump(bool need_dump_field);
+  virtual void SetChannelWriter(ChannelObject<std::string>* queue);
+  virtual void SetWorkerNum(int num) { worker_num_ = num; }
+  virtual void Schedule(int taskid);
+  virtual void JumpContext(std::shared_ptr<HeterTask> task);
+  virtual void CacheProgram(const ProgramDesc& main_program) {
+    new (&program_) ProgramDesc(main_program);
+  }
+  virtual void GetXpuOpIndex();
+
+ protected:
+  std::shared_ptr<FleetWrapper> fleet_ptr_;
+  std::shared_ptr<HeterWrapper> heter_ptr_;
+  std::shared_ptr<PullDenseWorker> pull_dense_worker_;
+  void FillSparseValue(std::shared_ptr<HeterTask> task, size_t table_id);
+  void PushGradients();
+  void CollectLabelInfo(std::shared_ptr<HeterTask> task, size_t table_id);
+  void AdjustInsWeight(std::shared_ptr<HeterTask> task);
+  void DumpParam();
+  void CopySparseTable();
+  void CopyDenseTable();
+  void CopyDenseVars();
+
+ private:
+  int mpi_rank_;
+  int worker_num_;
+  int xpu_begin_op_index_;
+  int xpu_end_op_index_;
+  ProgramDesc program_;
+  HeterObjectPool<HeterTask> object_pool_;
+  HeterList<int, std::shared_ptr<HeterTask>> run_queue_;
+  HeterList<int, std::shared_ptr<HeterTask>> wait_queue_;
+  bool need_dump_param_;
+  std::vector<std::string> dump_param_;
+  bool need_to_push_dense_;
+  bool need_dump_field_;
+  bool dump_slot_;
+  bool need_to_push_sparse_;
+  std::vector<std::string> dump_fields_;
+  ChannelWriter<std::string> writer_;
+  DownpourWorkerParameter param_;
+  float scale_datanorm_;
+  // just save the value in param_ for easy access
+  std::map<uint64_t, std::string> label_var_name_;
+  std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
+  std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
+  std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
+  std::map<uint64_t, std::vector<std::string>> dense_value_names_;
+  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
+  platform::Place root_place_;
+  // actually pushed feasign of each table
+  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
+
+  // skipped ops
+  std::vector<std::string> skip_ops_;
+
+  std::vector<::std::future<int32_t>> push_sparse_status_;
+  std::vector<::std::future<int32_t>> push_dense_status_;
+
+  // adjust ins weight
+  AdjustInsWeightConfig adjust_ins_weight_config_;
+  std::vector<float> nid_show_;
+  // check nan and inf during training
+  std::vector<std::string> check_nan_var_names_;
+  // copy table
+  CopyTableConfig copy_table_config_;
+  std::map<uint64_t, uint64_t> table_dependency_;
+  std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
+  std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
+  std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
+};
+#endif
+
 #if defined(PADDLE_WITH_NCCL)
 class SectionWorker : public DeviceWorker {
  public:
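For orientation, here is a minimal sketch of how a trainer would obtain and set up this worker through the device-worker factory (registered in the next hunk). Illustrative only, not part of the patch; it assumes a PSLIB build and a populated `TrainerDesc` whose `device_worker_name()` is "HeterCpuWorker":

```cpp
#include <memory>
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer_desc.pb.h"

void SetUpHeterWorkerSketch(const paddle::framework::TrainerDesc& desc,
                            int thread_num) {
  // The factory looks the class up by the name registered below.
  std::shared_ptr<paddle::framework::DeviceWorker> worker =
      paddle::framework::DeviceWorkerFactory::CreateDeviceWorker(
          desc.device_worker_name());
  worker->Initialize(desc);
  worker->SetWorkerNum(thread_num);  // new virtual added by this patch; a no-op by default
}
```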
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index 80e4000c9dc686bc413b38fcf8298dc8b5399335..67be8db6e80329de4323e4cb8f904a24753f56bc 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -62,6 +62,9 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
 REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
+#ifdef PADDLE_WITH_PSLIB
+REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
+#endif
 #if defined(PADDLE_WITH_NCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc
index e2a7375df9e46713aebe9f815f93809568b86c0f..4d55d2987f3f39525c1070e3213f3a2e84e18dff 100644
--- a/paddle/fluid/framework/dist_multi_trainer.cc
+++ b/paddle/fluid/framework/dist_multi_trainer.cc
@@ -35,7 +35,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
   dump_file_num_ = trainer_desc.dump_file_num();
   const std::vector<paddle::framework::DataFeed*> readers =
       dataset->GetReaders();
-
+  RegisterHeterCallback();
   thread_num_ = readers.size();
   workers_.resize(thread_num_);
   for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
@@ -55,6 +55,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
     workers_[i]->SetDumpParamVector(dump_param_);
     workers_[i]->InitRandomDumpConfig(trainer_desc);
     workers_[i]->Initialize(trainer_desc);
+    workers_[i]->SetWorkerNum(thread_num_);
   }

   VLOG(3) << "going to initialize pull dense worker";
@@ -64,6 +65,13 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
   SetDebug(trainer_desc.debug());
 }

+void DistMultiTrainer::RegisterHeterCallback() {
+  auto fleet_ptr = FleetWrapper::GetInstance();
+  fleet_ptr->RegisterHeterCallback([this](int worker, int taskid) {
+    // workers_[worker]->Schedule(taskid);
+  });
+}
+
 void DistMultiTrainer::InitDumpEnv() {
   queue_ = paddle::framework::MakeChannel<std::string>();
   for (int i = 0; i < thread_num_; ++i) {
@@ -90,6 +98,9 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
     workers_[i]->SetRootScope(root_scope_);
     workers_[i]->CreateDeviceResource(main_program);  // Program
     workers_[i]->BindingDataFeedMemory();
+#ifdef PADDLE_WITH_PSLIB
+    workers_[i]->CacheProgram(main_program);
+#endif
   }
   // Scope* -> thread id, it will be used in push_dense op
   for (int i = 0; i < thread_num_; ++i) {
@@ -104,6 +115,11 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
   }
   pull_dense_worker_->SetRootScope(root_scope_);
   pull_dense_worker_->Start();
+#ifdef PADDLE_WITH_PSLIB
+  for (int i = 0; i < thread_num_; ++i) {
+    workers_[i]->GetXpuOpIndex();
+  }
+#endif
   VLOG(3) << "init other env done.";
 }
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 3f70835c9d312a652cd917ba53fb2f405ab401cc..1c64bf1d3f7f31f42308395b5b054f62fd97b429 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -379,7 +379,7 @@ void DownpourWorker::CopyDenseTable() {
     pull_dense_status.resize(0);
     fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
                                    dense_value_names_[dest_table],
-                                   &pull_dense_status);
+                                   &pull_dense_status, true);
     for (auto& t : pull_dense_status) {
       t.wait();
       auto status = t.get();
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 6922f92c8f7a3aa43f13fda59f2631f8529d5cc7..55ac44c5e32cef47f5204182a84b05ba0cd1ef1d 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -19,4 +19,6 @@ else()
   cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)

+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto)
+
 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index 6f571fa8d817bfa0323ea22b4b14ee96111642c4..335cbc382c178b1a14949764f2908dc402298868 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -154,6 +154,219 @@ void FleetWrapper::CreateClient2ClientConnection() {
 #endif
 }

+#ifdef PADDLE_WITH_PSLIB
+void FleetWrapper::HeterPullSparseVars(
+    int workerid, std::shared_ptr<HeterTask> task, const uint64_t table_id,
+    const std::vector<std::string>& var_names, int fea_value_dim,
+    const std::vector<std::string>& var_emb_names) {
+  std::vector<::std::future<int32_t>>
pull_sparse_status; + pull_sparse_status.resize(0); + auto& scope = *(task->scope_); + auto& fea_keys = (task->features_)[table_id]; + auto& fea_values = (task->feature_values_)[table_id]; + fea_keys.clear(); + for (size_t var_index = 0; var_index < var_names.size(); ++var_index) { + const std::string& name = var_names[var_index]; + Variable* var = scope.FindVar(name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; + int64_t* ids = tensor->data(); + size_t len = tensor->numel(); + + // skip slots which do not have embedding + const std::string& emb_name = var_emb_names[var_index]; + Variable* emb_var = scope.FindVar(emb_name); + if (emb_var == nullptr) { + continue; + } + + for (auto i = 0u; i < len; ++i) { + if (ids[i] == 0u) { + continue; + } + fea_keys.push_back(static_cast(ids[i])); + } + } + fea_values.resize(fea_keys.size() + 1); + for (auto& t : fea_values) { + t.resize(fea_value_dim); + } + std::vector pull_result_ptr; + for (auto& t : fea_values) { + pull_result_ptr.push_back(t.data()); + } + auto status = pslib_ptr_->_worker_ptr->heter_pull_sparse( + workerid, pull_result_ptr.data(), table_id, fea_keys.data(), + fea_keys.size(), task->taskid_); + pull_sparse_status.push_back(std::move(status)); + for (auto& t : pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + exit(-1); + } + } +} + +void FleetWrapper::HeterPushSparseVars( + std::shared_ptr task, const uint64_t table_id, + const std::vector& sparse_key_names, + const std::vector& sparse_grad_names, const int emb_dim, + std::vector<::std::future>* push_sparse_status, const bool use_cvm, + const bool dump_slot, const bool no_cvm) { + auto& scope = *(task->scope_); + int batch_size = task->cur_batch_; + int offset = 2; + int slot_offset = 0; + int grad_dim = emb_dim; + int show_index = 0; + int click_index = 1; + auto& fea_keys = (task->features_)[table_id]; + auto& fea_labels = (task->feature_labels_)[table_id]; + auto& push_values = (task->feature_grads_)[table_id]; + auto& sparse_push_keys = (task->sparse_push_keys_)[table_id]; + + if (use_cvm) { + offset = 0; + grad_dim = emb_dim - 2; + } + if (no_cvm) { + offset = 0; + grad_dim = emb_dim; + } + if (dump_slot) { + slot_offset = 1; + show_index = 1; + click_index = 2; + } + CHECK_GE(grad_dim, 0); + + sparse_push_keys.clear(); + sparse_push_keys.reserve(fea_keys.size() + 1); + push_values.resize(fea_keys.size() + 1); + for (auto& t : push_values) { + t.resize(emb_dim + offset + slot_offset); + } + uint64_t fea_idx = 0u; + for (size_t i = 0; + i < sparse_key_names.size() && i < sparse_grad_names.size(); ++i) { + Variable* var = scope.FindVar(sparse_key_names[i]); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + if (tensor == nullptr) { + LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null"; + exit(-1); + } + size_t len = tensor->numel(); + int64_t* ids = tensor->data(); + int slot = 0; + if (dump_slot) { + slot = boost::lexical_cast(sparse_key_names[i]); + } + Variable* g_var = scope.FindVar(sparse_grad_names[i]); + if (g_var == nullptr) { + continue; + } + LoDTensor* g_tensor = g_var->GetMutable(); + if (g_tensor == nullptr) { + LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null"; + exit(-1); + } + float* g = g_tensor->data(); + + if 
(scale_sparse_gradient_with_batch_size_ && grad_dim > 0) { + int dim = emb_dim + offset; + Eigen::Map< + Eigen::Matrix> + g_mat(g, g_tensor->numel() / dim, dim); + g_mat.rightCols(grad_dim) *= batch_size; + } + for (auto id_idx = 0u; id_idx < len; ++id_idx) { + if (ids[id_idx] == 0) { + g += emb_dim; + continue; + } + sparse_push_keys.push_back(ids[id_idx]); + CHECK(fea_idx < push_values.size()); + + if (use_cvm || no_cvm) { + memcpy(push_values[fea_idx].data() + offset + slot_offset, g, + sizeof(float) * emb_dim); + } else { + CHECK(fea_idx < fea_labels.size()); + memcpy(push_values[fea_idx].data() + offset + slot_offset, g, + sizeof(float) * emb_dim); + push_values[fea_idx][show_index] = 1.0f; + push_values[fea_idx][click_index] = + static_cast(fea_labels[fea_idx]); + } + if (dump_slot) { + push_values[fea_idx][0] = static_cast(slot); + } + g += emb_dim; + fea_idx++; + } + } + // slots whose embedding has been stop gradient or + // not involved in forward-backward + uint64_t no_grad_fea_num = 0u; + for (size_t i = sparse_grad_names.size(); i < sparse_key_names.size(); ++i) { + Variable* var = scope.FindVar(sparse_key_names[i]); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + if (tensor == nullptr) { + LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null"; + exit(-1); + } + size_t len = tensor->numel(); + int64_t* ids = tensor->data(); + for (auto id_idx = 0u; id_idx < len; ++id_idx) { + if (ids[id_idx] == 0) { + continue; + } + ++no_grad_fea_num; + } + } + CHECK(fea_idx + no_grad_fea_num == fea_keys.size()) + << "fea_idx: " << fea_idx << " no_grad_fea_num: " << no_grad_fea_num + << " features size: " << fea_keys.size(); + CHECK(fea_idx == sparse_push_keys.size()); + if (fea_idx == 0) { + return; + } + std::vector push_g_vec; + for (auto i = 0u; i < sparse_push_keys.size(); ++i) { + push_g_vec.push_back(push_values[i].data()); + } + auto status = pslib_ptr_->_worker_ptr->push_sparse( + table_id, sparse_push_keys.data(), (const float**)push_g_vec.data(), + sparse_push_keys.size()); + push_sparse_status->push_back(std::move(status)); +} +#endif + +int FleetWrapper::RegisterHeterCallback(HeterCallBackFunc handler) { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "calling FleetWrapper::RegisterHeterCallback"; + VLOG(3) << "pslib_ptr_=" << pslib_ptr_; + VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr; + return pslib_ptr_->_worker_ptr->registe_heter_callback(handler); +#else + VLOG(0) << "FleetWrapper::RegisterHeterCallback" + << " does nothing when no pslib"; +#endif + return 0; +} + void FleetWrapper::PullSparseToLocal(const uint64_t table_id, int fea_value_dim) { #ifdef PADDLE_WITH_PSLIB @@ -421,13 +634,17 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, void FleetWrapper::PullDenseVarsAsync( const Scope& scope, const uint64_t tid, const std::vector& var_names, - std::vector<::std::future>* pull_dense_status) { + std::vector<::std::future>* pull_dense_status, bool in_cpu) { #ifdef PADDLE_WITH_PSLIB auto& regions = _regions[tid]; regions.clear(); regions.resize(var_names.size()); for (auto i = 0u; i < var_names.size(); ++i) { - Variable* var = scope.FindVar(var_names[i]); + std::string varname = var_names[i]; + if (!in_cpu) { + varname = var_names[i] + "pin"; + } + Variable* var = scope.FindVar(varname); LoDTensor* tensor = var->GetMutable(); float* w = tensor->data(); paddle::ps::Region reg(w, tensor->numel()); @@ -485,6 +702,57 @@ void FleetWrapper::PushDenseVarsSync( Scope* scope, const uint64_t table_id, 
const std::vector& var_names) {} +#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +void FleetWrapper::PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size, const paddle::platform::Place& place, + cudaStream_t stream, cudaEvent_t event) { + std::vector regions; + for (auto& t : var_names) { + Variable* var = scope.FindVar(t); + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g_data = tensor->data(); + + Variable* pin_var = scope.FindVar(t + "pin"); + LoDTensor* pin_tensor = pin_var->GetMutable(); + float* pin_g = pin_tensor->mutable_data(tensor->dims(), + platform::CUDAPinnedPlace()); + memory::Copy(platform::CUDAPinnedPlace(), pin_g, + BOOST_GET_CONST(platform::CUDAPlace, place), g_data, + sizeof(float) * count, stream); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + cudaEventSynchronize(event); + + float* g = pin_g; + if (scale_datanorm >= 0) { + if (t.find(".batch_size@GRAD") != std::string::npos || + t.find(".batch_sum@GRAD") != std::string::npos) { + Eigen::Map mat(g, 1, count); + float scale = 1.0 / batch_size; + mat *= scale; + } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) { + VLOG(3) << "epsilon: " << scale_datanorm; + for (int i = 0; i < count; ++i) { + g[i] = (g[i] - batch_size * scale_datanorm) / batch_size + + batch_size * scale_datanorm; + } + } + } + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + + auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); + if (push_sparse_status) { + push_sparse_status->push_back(std::move(status)); + } +} + +#endif void FleetWrapper::PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, @@ -1085,8 +1353,8 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope, push_status.wait(); auto status = push_status.get(); if (status != 0) { - PADDLE_THORW(platform::errors::Fatal( - "push shrink dense param failed, status is [%d].", status)); + // PADDLE_THORW(platform::errors::Fatal( + // "push shrink dense param failed, status is [%d].", status)); sleep(sleep_seconds_before_fail_exit_); exit(-1); } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 4c0564f87d48b4ba06bdb75e94fbb80b3d4f448a..92f3a625a755bba4989033c0cd41d9b25591c960 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -28,6 +28,7 @@ limitations under the License. 
*/
 #include
 #include
+#include "paddle/fluid/framework/heter_service.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -80,6 +81,24 @@ class FleetWrapper {
     pull_local_thread_num_ = thread_num;
   }

+#ifdef PADDLE_WITH_PSLIB
+  void HeterPullSparseVars(int workerid, std::shared_ptr<HeterTask> task,
+                           const uint64_t table_id,
+                           const std::vector<std::string>& var_names,
+                           int fea_dim,
+                           const std::vector<std::string>& var_emb_names);
+
+  void HeterPushSparseVars(
+      std::shared_ptr<HeterTask> task, const uint64_t table_id,
+      const std::vector<std::string>& sparse_key_names,
+      const std::vector<std::string>& sparse_grad_names, const int emb_dim,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      const bool use_cvm, const bool dump_slot, const bool no_cvm);
+#endif
+
+  typedef std::function<void(int, int)> HeterCallBackFunc;
+  int RegisterHeterCallback(HeterCallBackFunc handler);
+
   // Pull sparse variables from server in sync mode
   // Param: scope, table_id, var_names, fea_keys, fea_dim, var_emb_names
   // Param: fea_values
@@ -118,15 +137,24 @@ class FleetWrapper {
   void PullDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
       const std::vector<std::string>& var_names,
-      std::vector<::std::future<int32_t>>* pull_dense_status);
+      std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu);

   // push dense parameters(not gradients) to server in sync mode
   void PushDenseParamSync(const Scope& scope, const uint64_t table_id,
                           const std::vector<std::string>& var_names);

-  // Push dense variables to server in async mode
-  // Param: scope, table_id, var_names, scale_datanorm, batch_size
-  // Param: push_sparse_status
+// Push dense variables to server in async mode
+// Param: scope, table_id, var_names, scale_datanorm, batch_size
+// Param: push_sparse_status
+#ifdef PADDLE_WITH_CUDA
+  void PushDenseVarsAsync(
+      const Scope& scope, const uint64_t table_id,
+      const std::vector<std::string>& var_names,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      float scale_datanorm, int batch_size,
+      const paddle::platform::Place& place, cudaStream_t stream,
+      cudaEvent_t event);
+#endif
   void PushDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
       const std::vector<std::string>& var_names,
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index 45512d6adc49c577c2332ae2ef2c42408138990e..49181cd05f3fac259489fcfc67fd99c559fafeef 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -54,10 +54,10 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
       paddle::framework::fs_remove(tmp);
       if (i == retry_times_) {
         VLOG(0) << "fs_open_write failed, retry times reaches limit";
-        PADDLE_THROW(platform::errors::PreconditionNotMet(
-            "fs_open_write failed, retry times reaches"
-            " limit ",
-            retry_times_));
+        // PADDLE_THROW(platform::errors::PreconditionNotMet(
+        //     "fs_open_write failed, retry times reaches"
+        //     " limit ",
+        //     retry_times_));
       }
     } else {
       break;
@@ -143,9 +143,9 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
           break;
         }
       }
-      PADDLE_THROW(platform::errors::ExecutionTimeout(
-          "TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
-          last_check_rank));
+      // PADDLE_THROW(platform::errors::ExecutionTimeout(
+      VLOG(0) << "TIMEOUT self_rank = " << self_rank_
+              << " pair_rank = " << last_check_rank;
     }
     std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_));
   }
diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc
new file mode 100644 index
0000000000000000000000000000000000000000..b70d5e5fc1ae6c90dac4ebf1d86353e38a79492d --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include +#include +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/timer.h" +#ifdef PADDLE_WITH_PSLIB + +namespace paddle { +namespace framework { + +std::shared_ptr HeterWrapper::s_instance_ = NULL; +bool HeterWrapper::is_initialized_ = false; + +void HeterWrapper::CreateClient2XpuConnection() { + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.connection_type = "single"; + options.timeout_ms = 2000000; + + xpu_channels_.resize(xpu_list_.size()); + for (size_t i = 0; i < xpu_list_.size(); ++i) { + VLOG(3) << "channel init: " << xpu_list_[i]; + xpu_channels_[i].reset(new brpc::Channel()); + if (xpu_channels_[i]->Init(xpu_list_[i].c_str(), "", &options) != 0) { + VLOG(0) << "server channel init fail"; + } + } +} + +void HeterWrapper::RegisterServiceHandler(int cmd, HeterServiceHandler func) { + service_.RegisterServiceHandler(cmd, func); +} + +void HeterWrapper::SetXpuList(const std::vector& xpu_list) { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "Going to set xpu list"; + for (auto& x : xpu_list) { + xpu_list_.push_back(x); + VLOG(3) << "set xpu list: " << x << " size: " << xpu_list_.size(); + } +#endif +} + +void HeterWrapper::StartXpuService(const std::string& ip, uint32_t port) { + std::string ip_port = ip + ":" + std::to_string(port); + VLOG(3) << "xpu server starts at " << ip_port; + + server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE); + brpc::ServerOptions options; + if (server_.Start(ip_port.c_str(), &options) != 0) { + VLOG(0) << "xpu server start fail"; + } +} + +// void HeterWrapper::SerializeToReq(const std::string& varname, +// Scope* scope, HeterRequest& request) { +// auto* req_var = request.mutable_vars(); + +void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, + VariableMessage* req_var) { + Variable* var = 
scope->FindVar(varname); + if (var == nullptr) { + return; + } + LoDTensor* tensor = var->GetMutable(); + req_var->set_varname(varname); + req_var->set_type(LOD_TENSOR); + req_var->set_data_type(static_cast(tensor->type())); + + for (auto& dim : framework::vectorize(tensor->dims())) { + req_var->add_dims(dim); + } + const framework::LoD lod = tensor->lod(); + if (lod.size() > 0) { + req_var->set_lod_level(lod.size()); + for (auto& each : lod) { + VariableMessage::LodData* lod_inner = req_var->add_lod(); + for (auto& d : each) { + lod_inner->add_lod_data(d); + } + } + } + + auto* req_data = req_var->mutable_data(); + req_data->clear(); + req_data->resize(tensor->numel() * SizeOfType(tensor->type())); + char* data_ptr = const_cast(req_data->data()); + + if (platform::is_cpu_place(tensor->place())) { + memcpy(data_ptr, tensor->data(), + tensor->numel() * SizeOfType(tensor->type())); + } +#ifdef PADDLE_WITH_CUDA + else { + memory::Copy(platform::CPUPlace(), data_ptr, + BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), + tensor->data(), + tensor->numel() * SizeOfType(tensor->type()), nullptr); + } +#endif +} + +// void HeterWrapper::DeSerializeToTensor(Scope* scope, +// const HeterRequest* request) { +#ifdef PADDLE_WITH_CUDA +void HeterWrapper::DeSerializeToTensor(Scope* scope, + const VariableMessage& req_var, + platform::Place place, + cudaStream_t stream) { +#else +void HeterWrapper::DeSerializeToTensor(Scope* scope, + const VariableMessage& req_var, + platform::Place place) { +#endif + // const VariableMessage& req_var = request->vars(); + auto* var = scope->FindVar(req_var.varname()); + auto* tensor = var->GetMutable(); + + std::vector vec_dim; + for (auto& x : req_var.dims()) { + vec_dim.push_back(x); + } + tensor->Resize(make_ddim(vec_dim)); + + LoD lod; + for (int i = 0; i < req_var.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) { + v.push_back(req_var.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + + void* tensor_data = + tensor->mutable_data(place, ToVarType(req_var.data_type())); + +#ifdef PADDLE_WITH_CUDA + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, + platform::CPUPlace(), req_var.data().data(), + tensor->numel() * SizeOfType(tensor->type()), stream); +#else + memcpy(tensor_data, req_var.data().data(), + tensor->numel() * SizeOfType(tensor->type())); +#endif +} + +framework::proto::VarType::Type HeterWrapper::ToVarType( + VariableMessage::Type type) { + switch (type) { + case VariableMessage::FP32: + return framework::proto::VarType::FP32; // NOLINT + case VariableMessage::FP64: + return framework::proto::VarType::FP64; // NOLINT + case VariableMessage::INT32: + return framework::proto::VarType::INT32; // NOLINT + case VariableMessage::INT64: + return framework::proto::VarType::INT64; // NOLINT + case VariableMessage::BOOL: + return framework::proto::VarType::BOOL; // NOLINT + default: + VLOG(0) << "Not support type " << type; + } +} + +void HeterWrapper::StopXpuService(int num) { + HeterRequest request; + HeterResponse response; + brpc::Controller cntl; + request.set_cmd(2); + // for (size_t i = 0; i < xpu_channels_.size(); ++i) { + HeterService_Stub stub(xpu_channels_[num].get()); + stub.service(&cntl, &request, &response, NULL); + if (cntl.Failed()) { + VLOG(0) << "call stop xpu service fail: " << cntl.ErrorText(); + } else { + VLOG(3) << "call stop xpu service success"; + } + // } +} + +void HeterWrapper::EndPass(Scope* scope, int num) { + HeterRequest request; + 
HeterResponse response; + brpc::Controller cntl; + request.set_cmd(1); + // for (size_t i = 0; i < xpu_channels_.size(); ++i) { + HeterService_Stub stub(xpu_channels_[num].get()); + stub.service(&cntl, &request, &response, NULL); + if (cntl.Failed()) { + VLOG(0) << "call end pass fail: " << cntl.ErrorText(); + } else { + VLOG(3) << "call end pass success"; + for (int j = 0; j < response.vars_size(); ++j) { + DeSerializeToTensor(scope, response.vars(j), platform::CPUPlace()); + } + } + // } +} + +void HeterWrapper::CallRemoteXpu(std::shared_ptr task, + HeterCpuWorker* worker, int mpi_rank, + std::vector& send_vars) { + HeterRequest request; + request.set_cmd(0); + request.set_cur_batch(task->cur_batch_); + + OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) { + auto* closure = (OnHeterRpcDone*)done; + if (closure->cntl.Failed()) { + VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText(); + } else { + VLOG(3) << "call xpu success"; + } + // DeSerializeToTensor(task->scope_, + // closure->response.vars(), platform::CPUPlace()); + for (int i = 0; i < closure->response.vars_size(); ++i) { + DeSerializeToTensor(task->scope_, closure->response.vars(i), + platform::CPUPlace()); + } + + worker->Schedule(task->taskid_); + }); + + // std::vector varnames = {"click", "12345"}; + // //varnames.push_back(send_var); + // //if (send_var == "_generated_var_412") { + // varnames.push_back("filter_by_instag_0.tmp_0"); + // varnames.push_back("filter_by_instag_2.tmp_0"); + // varnames.push_back("filter_by_instag_0.tmp_1"); + // varnames.push_back("concat_1.tmp_0"); + // } + for (auto& varname : send_vars) { + auto* req_var = request.add_vars(); + SerializeToReq(varname, task->scope_, req_var); + } + + int num = mpi_rank % xpu_channels_.size(); + HeterService_Stub stub(xpu_channels_[num].get()); + // stub.service(&cntl, &request, &response, + // brpc::NewCallback(&HeterWrapper::RpcCallBack, + // response, cntl, worker, task)); + stub.service(&done->cntl, &request, &done->response, done); +} + +void HeterWrapper::CallRemoteXpuSync(std::shared_ptr task, + HeterCpuWorker* worker, int mpi_rank, + std::vector& send_vars) { + HeterRequest request; + HeterResponse response; + brpc::Controller cntl; + request.set_cmd(0); + request.set_cur_batch(task->cur_batch_); + + // std::vector varnames = {"concat_1.tmp_0", "click", "12345"}; + for (auto& varname : send_vars) { + auto* req_var = request.add_vars(); + SerializeToReq(varname, task->scope_, req_var); + } + + HeterService_Stub stub(xpu_channels_[0].get()); + stub.service(&cntl, &request, &response, NULL); + if (cntl.Failed()) { + VLOG(0) << "call xpu fail: " << cntl.ErrorText(); + } else { + VLOG(3) << "call xpu success"; + for (int i = 0; i < response.vars_size(); ++i) { + DeSerializeToTensor(task->scope_, response.vars(i), platform::CPUPlace()); + } + } +} + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..6ba4e00fc851b1f1b6e10764b5bc069f515d4b47 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -0,0 +1,123 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef PADDLE_WITH_PSLIB
+#include "paddle/fluid/framework/heter_service.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace framework {
+
+class HeterCpuWorker;
+
+typedef std::function<void(void*)> HeterRpcCallbackFunc;
+
+class OnHeterRpcDone : public google::protobuf::Closure {
+ public:
+  OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
+  virtual ~OnHeterRpcDone() {}
+  void Run() {
+    std::unique_ptr<OnHeterRpcDone> self_guard(this);
+    handler_(this);
+  }
+
+  HeterRpcCallbackFunc handler_;
+  HeterResponse response;
+  brpc::Controller cntl;
+};
+
+class HeterWrapper {
+ public:
+  virtual ~HeterWrapper() {
+    server_.Stop(1000);
+    server_.Join();
+  }
+
+  HeterWrapper() {}
+
+  static void HeterRpcCallBack(HeterResponse* response, brpc::Controller* cntl,
+                               HeterCpuWorker* worker,
+                               std::shared_ptr<HeterTask> task);
+
+  void CreateClient2XpuConnection();
+
+  void RegisterServiceHandler(int cmd, HeterServiceHandler func);
+
+  void StartXpuService(const std::string& ip, uint32_t port);
+
+  void CallRemoteXpu(std::shared_ptr<HeterTask> task, HeterCpuWorker* worker,
+                     int mpi_rank, std::vector<std::string>& send_vars);
+
+  void CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
+                         HeterCpuWorker* worker, int mpi_rank,
+                         std::vector<std::string>& send_vars);
+
+  void StopXpuService(int num);
+
+  void EndPass(Scope* scope, int num);
+
+  void SerializeToReq(const std::string& varname, Scope* scope,
+                      VariableMessage* req_var);
+
+  framework::proto::VarType::Type ToVarType(VariableMessage::Type type);
+
+#ifdef PADDLE_WITH_CUDA
+  void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
+                           platform::Place place,
+                           cudaStream_t stream = nullptr);
+#else
+  void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
+                           platform::Place place);
+#endif
+  // HeterWrapper singleton
+  static std::shared_ptr<HeterWrapper> GetInstance() {
+    if (NULL == s_instance_) {
+      s_instance_.reset(new paddle::framework::HeterWrapper());
+    }
+    return s_instance_;
+  }
+
+  std::vector<std::string>& GetXpuList() { return xpu_list_; }
+
+  void SetXpuList(const std::vector<std::string>& xpu_list);
+
+ private:
+  static std::shared_ptr<HeterWrapper> s_instance_;
+
+ protected:
+  std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
+  brpc::Server server_;
+  HeterXpuService service_;
+  static bool is_initialized_;
+  DISABLE_COPY_AND_ASSIGN(HeterWrapper);
+  std::vector<std::string> xpu_list_;
+};
+
+}  // end namespace framework
+}  // end namespace paddle
+#endif
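`OnHeterRpcDone` is the completion hook for the async brpc calls. A minimal usage sketch, mirroring `HeterWrapper::CallRemoteXpu` in heter_wrapper.cc above; `channel` and `request` are assumed to be an initialized `std::shared_ptr<brpc::Channel>` and a populated `HeterRequest`:

```cpp
auto* done = new OnHeterRpcDone([](void* d) {
  // brpc invokes Run() exactly once; Run() frees the closure via its
  // unique_ptr self-guard, so no manual delete is needed.
  auto* closure = reinterpret_cast<OnHeterRpcDone*>(d);
  if (closure->cntl.Failed()) {
    VLOG(0) << "async xpu call failed: " << closure->cntl.ErrorText();
  }
  // closure->response now carries the VariableMessages sent back.
});
HeterService_Stub stub(channel.get());
// A non-NULL done makes the generated stub call asynchronous.
stub.service(&done->cntl, &request, &done->response, done);
```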
diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..8662e460aa340eab834ecb1721fc4708c7220b29
--- /dev/null
+++ b/paddle/fluid/framework/heter_service.h
@@ -0,0 +1,368 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include  // NOLINT
+#include
+#include  // NOLINT
+#include  // NOLINT
+#include  // NOLINT
+#include
+#include "paddle/fluid/framework/heter_service.pb.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_PSLIB
+#include "brpc/channel.h"
+#include "brpc/controller.h"
+#include "brpc/server.h"
+
+namespace paddle {
+namespace framework {
+
+typedef std::function<int(const HeterRequest*, HeterResponse*)>
+    HeterServiceHandler;
+class DataFeed;
+
+class HeterXpuService : public HeterService {
+ public:
+  HeterXpuService() {}
+  virtual ~HeterXpuService() {}
+  void service(::google::protobuf::RpcController* controller,
+               const HeterRequest* request, HeterResponse* response,
+               ::google::protobuf::Closure* done) {
+    brpc::ClosureGuard done_guard(done);
+    int ret = 0;
+    int cmd = request->cmd();
+    auto itr = handler_map_.find(cmd);
+    if (itr == handler_map_.end()) {
+    } else {
+      ret = itr->second(request, response);
+    }
+    // response->set_err_code(0);
+    // response->set_err_msg("");
+    if (ret != 0) {
+      // response->set_err_code(-1);
+      // response->set_err_msg("xpu service error");
+    }
+  }
+
+  void RegisterServiceHandler(int cmd, HeterServiceHandler func) {
+    VLOG(0) << "register heter service";
+    handler_map_[cmd] = func;
+  }
+
+ private:
+  std::unordered_map<int, HeterServiceHandler> handler_map_;
+};
+
+enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE };
+
+class HeterTask {
+ public:
+  void Update() {
+    if (state_ == PULL_SPARSE) {
+      state_ = OP_RUN;
+    } else if (state_ == OP_RUN) {
+      state_ = XPU;
+      // state_ = PUSH_GRAD;
+      // state_ = PUSH_GRAD;
+    } else if (state_ == XPU) {
+      state_ = OP_RUN_END;
+    } else if (state_ == OP_RUN_END) {
+      state_ = PUSH_GRAD;
+    } else if (state_ == PUSH_GRAD) {
+      state_ = DONE;
+    }
+  }
+  void Reset() {
+    total_time = 0;
+    read_time = 0;
+    pack_time = 0;
+    pull_sparse_local_time = 0;
+    op_all_time = 0;
+    xpu_op_time = 0;
+    xpu_wait_time = 0;
+    cpu_op_time = 0;
+    collect_label_time = 0;
+    fill_sparse_time = 0;
+    push_sparse_time = 0;
+  }
+  void Show() {
+    std::cout << "features size " << features_.size() << std::endl;
+    for (size_t i = 0; i < features_.size(); ++i) {
+      std::cout << "features[" << i << "] size " << features_[i].size()
+                << std::endl;
+    }
+  }
+  void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch,
+                const ProgramDesc& program);
+
+  Scope* scope_{nullptr};
+  int taskid_;
+  int cur_batch_;
+  HeterTaskState state_;
+  // cache
+  std::map<uint64_t, std::vector<uint64_t>> features_;
+  std::map<uint64_t, std::vector<float>> feature_labels_;
+  std::map<uint64_t, std::vector<std::vector<float>>> feature_values_;
+  std::map<uint64_t, std::vector<std::vector<float>>> feature_grads_;
+  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
+  double total_time{0};
+  double read_time{0};
+  double pack_time{0};
+  double pull_sparse_local_time{0};
+  double op_all_time{0};
+  double xpu_op_time{0};
+  double xpu_wait_time{0};
+  double cpu_op_time{0};
+  double collect_label_time{0};
+  double fill_sparse_time{0};
+  double push_sparse_time{0};
+};
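The `Update()` chain above encodes the per-batch pipeline as a linear state machine. Walking one task through it (sketch; states as declared in the enum):

```cpp
HeterTask task;
task.state_ = PULL_SPARSE;  // set by PackTask()
task.Update();  // PULL_SPARSE -> OP_RUN     (CPU ops before the XPU segment)
task.Update();  // OP_RUN      -> XPU        (segment shipped via CallRemoteXpu)
task.Update();  // XPU         -> OP_RUN_END (CPU ops after the reply)
task.Update();  // OP_RUN_END  -> PUSH_GRAD
task.Update();  // PUSH_GRAD   -> DONE
```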
+ VLOG(0) << "pool construct size: " << num_; +#endif + return std::make_shared(); + } else { + auto ret = pool_.back(); + pool_.pop_back(); + return ret; + } + } + void Push(std::shared_ptr data) { + std::lock_guard lock(mutex_); + pool_.push_back(std::move(data)); + } + int Size() { + std::lock_guard lock(mutex_); + return pool_.size(); + } + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + std::mutex mutex_; + int num_{0}; +}; + +struct BthreadMutextGuard { + BthreadMutextGuard(bthread_mutex_t* rho) { + mutex_ = rho; + bthread_mutex_lock(mutex_); + } + ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } + bthread_mutex_t* mutex_; +}; + +template +class BtObjectPool { + public: + BtObjectPool() { + bthread_mutex_init(&mutex_, NULL); + bthread_cond_init(&cond_, NULL); + } + + virtual ~BtObjectPool() { + bthread_cond_destroy(&cond_); + bthread_mutex_destroy(&mutex_); + }; + + std::shared_ptr Get() { + BthreadMutextGuard guard(&mutex_); + while (pool_.empty()) { + bthread_cond_wait(&cond_, &mutex_); + } + auto ret = pool_.back(); + pool_.pop_back(); + return ret; + } + + void Push(std::shared_ptr data) { + BthreadMutextGuard guard(&mutex_); + pool_.push_back(std::move(data)); + bthread_cond_signal(&cond_); + } + + int Size() { return pool_.size(); } + + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + bthread_mutex_t mutex_; + bthread_cond_t cond_; + int num_{0}; +}; + +template +struct HeterNode { + K key; + T value; + HeterNode* prev; + HeterNode* next; +}; + +template +class HeterList { + public: + HeterList() : head_(new HeterNode), tail_(new HeterNode) { + head_->prev = NULL; + head_->next = tail_; + tail_->prev = head_; + tail_->next = NULL; + size = 0; + cap_ = 1e9; + } + + ~HeterList() { + delete head_; + delete tail_; + } + + void SetCap(int num) { cap_ = num; } + + bool TryPut(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + if (task_map_.find(key) != task_map_.end()) { + // std::cout << "try put key=" << key << " false" << std::endl; + task_map_.erase(key); + return false; + } else { + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + // std::cout << "try put key=" << key << " true" << std::endl; + return true; + } + } + + bool Put(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + HeterNode* node = new HeterNode; + // std::cout << "put key=" << key << " true" << std::endl; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + + T TryGet(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + // std::cout << "try get key=" << key << " true" << std::endl; + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + task_map_.insert(key); + // std::cout << "try get key=" << key << " false" << std::endl; + return nullptr; + } + + T Get(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + // std::cout << "get key=" << key << " true" << std::endl; + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + // std::cout << "get key=" << key << " false" << std::endl; 
+
+template <class K, class T>
+class HeterList {
+ public:
+  HeterList() : head_(new HeterNode<K, T>), tail_(new HeterNode<K, T>) {
+    head_->prev = NULL;
+    head_->next = tail_;
+    tail_->prev = head_;
+    tail_->next = NULL;
+    size = 0;
+    cap_ = 1e9;
+  }
+
+  ~HeterList() {
+    delete head_;
+    delete tail_;
+  }
+
+  void SetCap(int num) { cap_ = num; }
+
+  bool TryPut(K& key, T& value) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cond_.wait(lock, [this] { return size < cap_; });
+    if (task_map_.find(key) != task_map_.end()) {
+      // std::cout << "try put key=" << key << " false" << std::endl;
+      task_map_.erase(key);
+      return false;
+    } else {
+      HeterNode<K, T>* node = new HeterNode<K, T>;
+      node->key = key;
+      node->value = value;
+      map_[node->key] = node;
+      attach(node);
+      // std::cout << "try put key=" << key << " true" << std::endl;
+      return true;
+    }
+  }
+
+  bool Put(K& key, T& value) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cond_.wait(lock, [this] { return size < cap_; });
+    HeterNode<K, T>* node = new HeterNode<K, T>;
+    // std::cout << "put key=" << key << " true" << std::endl;
+    node->key = key;
+    node->value = value;
+    map_[node->key] = node;
+    attach(node);
+    return true;
+  }
+
+  T TryGet(const K& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto iter = map_.find(key);
+    if (iter != map_.end()) {
+      // std::cout << "try get key=" << key << " true" << std::endl;
+      HeterNode<K, T>* node = iter->second;
+      detach(node);
+      cond_.notify_one();
+      T ret = std::move(node->value);
+      map_.erase(key);
+      delete node;
+      return ret;
+    }
+    task_map_.insert(key);
+    // std::cout << "try get key=" << key << " false" << std::endl;
+    return nullptr;
+  }
+
+  T Get(const K& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto iter = map_.find(key);
+    if (iter != map_.end()) {
+      // std::cout << "get key=" << key << " true" << std::endl;
+      HeterNode<K, T>* node = iter->second;
+      detach(node);
+      cond_.notify_one();
+      T ret = std::move(node->value);
+      map_.erase(key);
+      delete node;
+      return ret;
+    }
+    // std::cout << "get key=" << key << " false" << std::endl;
+    return nullptr;
+  }
+
+  T Get() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    HeterNode<K, T>* node = head_->next;
+    if (node == tail_) {
+      // std::cout << "get2 false" << std::endl;
+      return nullptr;
+    } else {
+      detach(node);
+      cond_.notify_one();
+      T ret = std::move(node->value);
+      map_.erase(node->key);
+      // std::cout << "get2 key=" << node->key << " true" << std::endl;
+      delete node;
+      return ret;
+    }
+  }
+
+  bool Empty() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return head_->next == tail_;
+  }
+
+  int Size() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return size;
+  }
+
+ private:
+  void detach(HeterNode<K, T>* node) {
+    node->prev->next = node->next;
+    node->next->prev = node->prev;
+    size--;
+  }
+
+  void attach(HeterNode<K, T>* node) {
+    node->prev = head_;
+    node->next = head_->next;
+    head_->next->prev = node;
+    head_->next = node;
+    size++;
+  }
+
+ private:
+  HeterNode<K, T>* head_;
+  HeterNode<K, T>* tail_;
+  std::unordered_map<K, HeterNode<K, T>*> map_;
+  std::unordered_set<K> task_map_;
+  std::mutex mutex_;
+  std::condition_variable cond_;
+  int cap_;
+  int size;
+};
+
+}  // namespace framework
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/framework/heter_service.proto b/paddle/fluid/framework/heter_service.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c8c9ec80b3fa85e503eb5c243b8933af914b1017
--- /dev/null
+++ b/paddle/fluid/framework/heter_service.proto
@@ -0,0 +1,69 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+package paddle.framework;
+option cc_generic_services = true;
+
+// It can be: LoDTensor、SelectedRows or NCCL_ID
+enum VarType {
+  LOD_TENSOR = 0;
+  SELECTED_ROWS = 1;
+  NCCL_ID = 2;
+}
+
+// VariableMessage is serialized paddle variable message.
+// NOTICE(gongwb):don't modify this proto if you are not
+// not familar with how we serialize in sendrecvop_utils.h
+// and deserilize it in variable_response.h.
+message VariableMessage {
+  enum Type {
+    // Pod Types
+    BOOL = 0;
+    INT16 = 1;
+    INT32 = 2;
+    INT64 = 3;
+    FP16 = 4;
+    FP32 = 5;
+    FP64 = 6;
+  }
+
+  message LodData { repeated int64 lod_data = 1; }
+  optional string varname = 1;
+  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
+  optional VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
+  optional Type data_type = 3;
+  repeated int64 dims = 4;
+
+  // lod details:
+  optional int64 lod_level = 5;
+  repeated LodData lod = 6;
+  // selected_rows height, aka.
original dim0 + optional int64 slr_height = 7; + // tensor data + optional bytes data = 8; +} +message HeterRequest { + required int32 cmd = 1; + optional int32 cur_batch = 2; + repeated VariableMessage vars = 3; +}; + +message HeterResponse { + // optional VariableMessage vars = 1; + repeated VariableMessage vars = 1; +}; + +service HeterService { rpc service(HeterRequest) returns (HeterResponse); }; diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc new file mode 100644 index 0000000000000000000000000000000000000000..83838f4df67d0bcbd9fb8ec8fb6762641287d2c4 --- /dev/null +++ b/paddle/fluid/framework/hetercpu_worker.cc @@ -0,0 +1,1166 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/string/string_helper.h" + +#ifdef PADDLE_WITH_PSLIB + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +namespace paddle { +namespace framework { + +void HeterTask::PackTask(Scope* thread_scope, int taskid, DataFeed* reader, + int cur_batch, const ProgramDesc& program) { + // total_time = 0; + // read_time = 0; + // pack_time = 0; + // pull_sparse_local_time = 0; + taskid_ = taskid; + auto& block = program.Block(0); + if (!scope_) { + scope_ = &(thread_scope->NewScope()); + for (auto& var : block.AllVars()) { + if (!var->Persistable()) { + auto* ptr = scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } + } + state_ = PULL_SPARSE; + cur_batch_ = cur_batch; + auto& use_slots = reader->GetUseSlotAlias(); + for (size_t i = 0; i < use_slots.size(); ++i) { + Variable* thread_var = thread_scope->FindVar(use_slots[i]); + LoDTensor* thread_tensor = thread_var->GetMutable(); + Variable* task_var = scope_->FindVar(use_slots[i]); + LoDTensor* task_tensor = task_var->GetMutable(); + TensorCopy(*thread_tensor, platform::CPUPlace(), task_tensor); + auto& tensor_lod = thread_tensor->lod()[0]; + LoD thread_lod{tensor_lod}; + task_tensor->set_lod(thread_lod); + } +} + +void HeterCpuWorker::GetXpuOpIndex() { + xpu_begin_op_index_ = trainer_desc_.xpu_start_idx(); + xpu_end_op_index_ = trainer_desc_.xpu_end_idx(); + VLOG(0) << "xpu begin: " << xpu_begin_op_index_ + << " xpu end: " << xpu_end_op_index_; + // CHECK(xpu_begin_op_index_ == trainer_desc_.xpu_start_idx()); + // CHECK(xpu_end_op_index_ == trainer_desc_.xpu_end_idx()); + // CHECK(trainer_desc_.op_run_start_idx() == 0); + // CHECK(trainer_desc_.op_run_end_idx() == xpu_begin_op_index_ - 1); + // CHECK(trainer_desc_.op_run_end_start_idx() == xpu_end_op_index_ + 1); + // CHECK(trainer_desc_.op_run_end_end_idx() == ops_.size() - 1); +} + +void HeterCpuWorker::Schedule(int taskid) { + VLOG(3) << "schedule " << taskid; + auto task = 
wait_queue_.TryGet(taskid); + if (task) { + run_queue_.Put(task->taskid_, task); + } +} + +void HeterCpuWorker::JumpContext(std::shared_ptr task) { + VLOG(3) << "jump context " << task->taskid_; + if (!(wait_queue_.TryPut(task->taskid_, task))) { + run_queue_.Put(task->taskid_, task); + } +} + +void HeterCpuWorker::Initialize(const TrainerDesc& desc) { + param_ = desc.downpour_param(); + mpi_rank_ = desc.mpi_rank(); + trainer_desc_ = desc; + for (int i = 0; i < param_.sparse_table_size(); ++i) { + uint64_t table_id = + static_cast(param_.sparse_table(i).table_id()); + TableParameter table = param_.sparse_table(i); + sparse_key_names_[table_id].resize(table.sparse_key_name_size()); + for (int j = 0; j < table.sparse_key_name_size(); ++j) { + sparse_key_names_[table_id][j] = table.sparse_key_name(j); + } + sparse_value_names_[table_id].resize(table.sparse_value_name_size()); + for (int j = 0; j < table.sparse_value_name_size(); ++j) { + sparse_value_names_[table_id][j] = table.sparse_value_name(j); + } + sparse_grad_names_[table_id].resize(table.sparse_grad_name_size()); + for (int j = 0; j < table.sparse_grad_name_size(); ++j) { + sparse_grad_names_[table_id][j] = table.sparse_grad_name(j); + } + label_var_name_[table_id] = table.label_var_name(); + sparse_push_keys_[table_id] = std::vector(); + } + + for (int i = 0; i < param_.dense_table_size(); ++i) { + uint64_t table_id = static_cast(param_.dense_table(i).table_id()); + auto table = param_.dense_table(i); + dense_value_names_[table_id].resize(table.dense_value_name_size()); + for (int j = 0; j < table.dense_value_name_size(); ++j) { + dense_value_names_[table_id][j] = table.dense_value_name(j); + } + dense_grad_names_[table_id].resize(table.dense_grad_name_size()); + for (int j = 0; j < table.dense_grad_name_size(); ++j) { + dense_grad_names_[table_id][j] = table.dense_grad_name(j); + } + } + + skip_ops_.resize(param_.skip_ops_size()); + for (int i = 0; i < param_.skip_ops_size(); ++i) { + skip_ops_[i] = param_.skip_ops(i); + } + for (int i = 0; i < param_.stat_var_names_size(); ++i) { + stat_var_name_map_[param_.stat_var_names(i)] = 1; + } + + need_to_push_sparse_ = param_.push_sparse(); + need_to_push_dense_ = param_.push_dense(); + + fleet_ptr_ = FleetWrapper::GetInstance(); + heter_ptr_ = HeterWrapper::GetInstance(); + fetch_config_ = desc.fetch_config(); + use_cvm_ = desc.use_cvm(); + // for sparse value accessor, embedding only + no_cvm_ = desc.no_cvm(); + scale_datanorm_ = desc.scale_datanorm(); + dump_slot_ = desc.dump_slot(); + dump_fields_.resize(desc.dump_fields_size()); + for (int i = 0; i < desc.dump_fields_size(); ++i) { + dump_fields_[i] = desc.dump_fields(i); + } + adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); + need_dump_param_ = false; + dump_param_.resize(desc.dump_param_size()); + for (int i = 0; i < desc.dump_param_size(); ++i) { + dump_param_[i] = desc.dump_param(i); + } + if (desc.dump_param_size() != 0) { + need_dump_param_ = true; + } + for (int i = 0; i < desc.check_nan_var_names_size(); ++i) { + check_nan_var_names_.push_back(desc.check_nan_var_names(i)); + } + copy_table_config_ = desc.copy_table_config(); + for (int i = 0; i < copy_table_config_.src_sparse_tables_size(); ++i) { + uint64_t src_table = copy_table_config_.src_sparse_tables(i); + uint64_t dest_table = copy_table_config_.dest_sparse_tables(i); + VLOG(3) << "copy_sparse_tables_ push back " << src_table << "->" + << dest_table; + copy_sparse_tables_.push_back(std::make_pair(src_table, dest_table)); + } + for (int i = 0; i < 
copy_table_config_.src_dense_tables_size(); ++i) { + uint64_t src_table = copy_table_config_.src_dense_tables(i); + uint64_t dest_table = copy_table_config_.dest_dense_tables(i); + VLOG(3) << "copy_dense_tables_ push back " << src_table << "->" + << dest_table; + copy_dense_tables_.push_back(std::make_pair(src_table, dest_table)); + } + for (auto& m : copy_table_config_.table_denpendency_map()) { + if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) { + // currently only support one dependency + for (auto& value : m.values()) { + table_dependency_[m.key()] = value; + } + } + } +} + +void HeterCpuWorker::SetChannelWriter(ChannelObject* queue) { + writer_.Reset(queue); +} + +void HeterCpuWorker::SetNeedDump(bool need_dump_field) { + need_dump_field_ = need_dump_field; +} + +// template +// std::string PrintLodTensorType(LoDTensor* tensor, +// int64_t start, int64_t end) { +// auto count = tensor->numel(); +// if (start < 0 || end > count) { +// VLOG(3) << "access violation"; +// return "access violation"; +// } +// std::ostringstream os; +// for (int64_t i = start; i < end; i++) { +// os << ":" << tensor->data()[i]; +// } +// return os.str(); +// } +// +// std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start, +// int64_t end) { +// auto count = tensor->numel(); +// if (start < 0 || end > count) { +// VLOG(3) << "access violation"; +// return "access violation"; +// } +// std::ostringstream os; +// for (int64_t i = start; i < end; i++) { +// os << ":" << static_cast(tensor->data()[i]); +// } +// return os.str(); +// } +// +// std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) { +// std::string out_val; +// if (tensor->type() == proto::VarType::FP32) { +// out_val = PrintLodTensorType(tensor, start, end); +// } else if (tensor->type() == proto::VarType::INT64) { +// out_val = PrintLodTensorIntType(tensor, start, end); +// } else if (tensor->type() == proto::VarType::FP64) { +// out_val = PrintLodTensorType(tensor, start, end); +// } else { +// out_val = "unsupported type"; +// } +// return out_val; +// } +// +// std::pair GetTensorBound(LoDTensor* tensor, int index) { +// auto& dims = tensor->dims(); +// if (tensor->lod().size() != 0) { +// auto& lod = tensor->lod()[0]; +// return {lod[index] * dims[1], lod[index + 1] * dims[1]}; +// } else { +// return {index * dims[1], (index + 1) * dims[1]}; +// } +// } +// +// bool CheckValidOutput(LoDTensor* tensor, size_t batch_size) { +// auto& dims = tensor->dims(); +// if (dims.size() != 2) return false; +// if (tensor->lod().size() != 0) { +// auto& lod = tensor->lod()[0]; +// if (lod.size() != batch_size + 1) { +// return false; +// } +// } else { +// if (dims[0] != static_cast(batch_size)) { +// return false; +// } +// } +// return true; +// } + +void HeterCpuWorker::DumpParam() { + // std::string os; + // for (auto& param : dump_param_) { + // os.clear(); + // os = param; + // Variable* var = thread_scope_->FindVar(param); + // if (var == nullptr) { + // continue; + // } + // LoDTensor* tensor = var->GetMutable(); + // int64_t len = tensor->numel(); + // os += PrintLodTensor(tensor, 0, len); + // writer_ << os; + // } +} + +void HeterCpuWorker::CollectLabelInfo(std::shared_ptr task, + size_t table_idx) { + if (no_cvm_) { + return; + } + uint64_t table_id = static_cast( + param_.program_config(0).pull_sparse_table_id(table_idx)); + + TableParameter table; + for (auto i : param_.sparse_table()) { + if (i.table_id() == table_id) { + table = i; + break; + } + } + auto& feature = 
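// NOTE (reader sketch): CollectLabelInfo below replicates the instance-level
// label once per pulled feasign: for each slot tensor, every feasign in
// instance k receives label_ptr[k]. Feasign 0 is the padding id defined in
// the protobuf config and is skipped, which is why the final global_index
// must equal the number of pulled features.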
(task->features_)[table_id]; + auto& feature_label = (task->feature_labels_)[table_id]; + Scope* scope = task->scope_; + feature_label.resize(feature.size()); + Variable* var = scope->FindVar(label_var_name_[table_id]); + LoDTensor* tensor = var->GetMutable(); + int64_t* label_ptr = tensor->data(); + + size_t global_index = 0; + for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { + VLOG(3) << "sparse_key_names_[" << i + << "]: " << sparse_key_names_[table_id][i]; + Variable* fea_var = scope->FindVar(sparse_key_names_[table_id][i]); + if (fea_var == nullptr) { + continue; + } + LoDTensor* tensor = fea_var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " + << sparse_key_names_[table_id][i] << " is null"; + + // skip slots which do not have embedding + Variable* emb_var = scope->FindVar(sparse_value_names_[table_id][i]); + if (emb_var == nullptr) { + continue; + } + int64_t* ids = tensor->data(); + size_t fea_idx = 0; + // tensor->lod()[0].size() == batch_size + 1 + for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) { + for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) { + // should be skipped feasign defined in protobuf + if (ids[fea_idx] == 0u) { + continue; + } + feature_label[global_index++] = + static_cast(label_ptr[lod_idx - 1]); + } + } + } + CHECK(global_index == feature.size()) + << "expect fea info size:" << feature.size() << " real:" << global_index; +} + +void HeterCpuWorker::FillSparseValue(std::shared_ptr task, + size_t table_idx) { + uint64_t table_id = static_cast( + param_.program_config(0).pull_sparse_table_id(table_idx)); + + TableParameter table; + for (auto i : param_.sparse_table()) { + if (i.table_id() == table_id) { + table = i; + break; + } + } + + auto& fea_value = (task->feature_values_)[table_id]; + Scope* scope = task->scope_; + auto fea_idx = 0u; + + std::vector init_value(table.fea_dim()); + for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { + std::string slot_name = sparse_key_names_[table_id][i]; + std::string emb_slot_name = sparse_value_names_[table_id][i]; + Variable* var = scope->FindVar(slot_name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " << slot_name << " is null"; + int64_t* ids = tensor->data(); + int len = tensor->numel(); + Variable* var_emb = scope->FindVar(emb_slot_name); + if (var_emb == nullptr) { + continue; + } + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = + tensor_emb->mutable_data({len, table.emb_dim()}, place_); + // memset(ptr, 0, sizeof(float) * len * table.emb_dim()); + auto& tensor_lod = tensor->lod()[0]; + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + + bool is_nid = (adjust_ins_weight_config_.need_adjust() && + adjust_ins_weight_config_.nid_slot() == emb_slot_name); + if (is_nid) { + nid_show_.clear(); + } + int nid_ins_index = 0; + + for (int index = 0; index < len; ++index) { + if (use_cvm_ || no_cvm_) { + if (ids[index] == 0u) { + memcpy(ptr + table.emb_dim() * index, init_value.data(), + sizeof(float) * table.emb_dim()); + if (is_nid) { + nid_show_.push_back(-1); + ++nid_ins_index; + } + continue; + } + memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(), + sizeof(float) * table.emb_dim()); + if (is_nid && + static_cast(index) == tensor->lod()[0][nid_ins_index]) { + nid_show_.push_back(fea_value[fea_idx][0]); + ++nid_ins_index; + } + fea_idx++; + } else { + if (ids[index] == 0u) { + memcpy(ptr + table.emb_dim() * index, init_value.data() 
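// NOTE (layout assumption, matching the DownpourWorker convention): in the
// non-CVM path the first two floats of each pulled value appear to be the
// show/click (CVM) slots maintained by the parameter server, so the copy
// below starts at data() + 2 and transfers only the emb_dim embedding part;
// with use_cvm_ the full value, show/click included, is copied. nid_show_
// reading fea_value[fea_idx][0] as the show count relies on the same layout.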
+ 2, + sizeof(float) * table.emb_dim()); + if (is_nid) { + nid_show_.push_back(-1); + ++nid_ins_index; + } + continue; + } + memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2, + sizeof(float) * table.emb_dim()); + if (is_nid && + static_cast(index) == tensor->lod()[0][nid_ins_index]) { + nid_show_.push_back(fea_value[fea_idx][0]); + ++nid_ins_index; + } + fea_idx++; + } + } + } +} + +void HeterCpuWorker::AdjustInsWeight(std::shared_ptr task) { +#ifdef _LINUX + // check var and tensor not null + Scope* scope = task->scope_; + if (!adjust_ins_weight_config_.need_adjust()) { + VLOG(0) << "need_adjust=false, skip adjust ins weight"; + return; + } + Variable* nid_var = scope->FindVar(adjust_ins_weight_config_.nid_slot()); + if (nid_var == nullptr) { + VLOG(0) << "nid slot var " << adjust_ins_weight_config_.nid_slot() + << " is nullptr, skip adjust ins weight"; + return; + } + LoDTensor* nid_tensor = nid_var->GetMutable(); + if (nid_tensor == nullptr) { + VLOG(0) << "tensor of nid slot var " << adjust_ins_weight_config_.nid_slot() + << " is nullptr, skip adjust ins weight"; + return; + } + Variable* ins_weight_var = + scope->FindVar(adjust_ins_weight_config_.ins_weight_slot()); + if (ins_weight_var == nullptr) { + VLOG(0) << "ins weight var " << adjust_ins_weight_config_.ins_weight_slot() + << " is nullptr, skip adjust ins weight"; + return; + } + LoDTensor* ins_weight_tensor = ins_weight_var->GetMutable(); + if (ins_weight_tensor == nullptr) { + VLOG(0) << "tensor of ins weight tensor " + << adjust_ins_weight_config_.ins_weight_slot() + << " is nullptr, skip adjust ins weight"; + return; + } + + float* ins_weights = ins_weight_tensor->data(); + size_t len = ins_weight_tensor->numel(); // len = batch size + // here we assume nid_show slot only has one feasign in each instance + CHECK(len == nid_show_.size()) << "ins_weight size should be equal to " + << "nid_show size, " << len << " vs " + << nid_show_.size(); + float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold(); + float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio(); + int64_t nid_adjw_num = 0; + double nid_adjw_weight = 0.0; + size_t ins_index = 0; + for (size_t i = 0; i < len; ++i) { + float nid_show = nid_show_[i]; + VLOG(3) << "nid_show " << nid_show; + if (nid_show < 0) { + VLOG(3) << "nid_show < 0, continue"; + continue; + } + float ins_weight = 1.0; + if (nid_show >= 0 && nid_show < nid_adjw_threshold) { + ins_weight = log(M_E + + (nid_adjw_threshold - nid_show) / nid_adjw_threshold * + nid_adjw_ratio); + // count nid adjw insnum and weight + ++nid_adjw_num; + nid_adjw_weight += ins_weight; + // choose large ins weight + VLOG(3) << "ins weight new " << ins_weight << ", ins weight origin " + << ins_weights[ins_index]; + if (ins_weight > ins_weights[ins_index]) { + VLOG(3) << "ins " << ins_index << " weight changes to " << ins_weight; + ins_weights[ins_index] = ins_weight; + } + ++ins_index; + } + } + VLOG(3) << "nid adjw info: total_adjw_num: " << nid_adjw_num + << ", avg_adjw_weight: " << nid_adjw_weight; +#endif +} + +void HeterCpuWorker::CopySparseTable() { + for (size_t i = 0; i < copy_sparse_tables_.size(); ++i) { + int64_t src_table = copy_sparse_tables_[i].first; + int64_t dest_table = copy_sparse_tables_[i].second; + int32_t feanum = 0; + if (src_table == dest_table) { + continue; + } else if (!copy_table_config_.sparse_copy_by_feasign()) { + if (feasign_set_.find(src_table) == feasign_set_.end()) { + continue; + } else if (feasign_set_[src_table].size() == 0) { + continue; + } 
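// NOTE (reader sketch): two copy modes, selected by sparse_copy_by_feasign():
//   - whole-table copy (CopyTable below), still skipped when no feasign of
//     src_table was touched since the last copy window;
//   - per-feasign copy (CopyTableByFeasign), which transfers only the keys
//     recorded in feasign_set_[src_table] during training.
// Either way feasign_set_ is cleared afterwards, so each window starts empty.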
+ feanum = fleet_ptr_->CopyTable(src_table, dest_table); + } else { + std::vector fea_vec(feasign_set_[src_table].begin(), + feasign_set_[src_table].end()); + feanum = fleet_ptr_->CopyTableByFeasign(src_table, dest_table, fea_vec); + fea_vec.clear(); + std::vector().swap(fea_vec); + } + VLOG(3) << "copy feasign from table " << src_table << " to table " + << dest_table << ", feasign num=" << feanum; + feasign_set_[src_table].clear(); + std::unordered_set().swap(feasign_set_[src_table]); + } + feasign_set_.clear(); +} + +void HeterCpuWorker::CopyDenseTable() { + if (thread_id_ != 0) { + return; + } + thread_local std::vector> pull_dense_status; + for (size_t i = 0; i < copy_dense_tables_.size(); ++i) { + uint64_t src_table = copy_dense_tables_[i].first; + uint64_t dest_table = copy_dense_tables_[i].second; + if (src_table == dest_table) { + continue; + } + int32_t dim = fleet_ptr_->CopyTable(src_table, dest_table); + VLOG(3) << "copy param from table " << src_table << " to table " + << dest_table << ", dim=" << dim; + if (copy_table_config_.dense_pull_after_copy()) { + VLOG(3) << "dense pull after copy, table=" << dest_table; + pull_dense_status.resize(0); + // fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table, + // dense_value_names_[dest_table], + // &pull_dense_status); + for (auto& t : pull_dense_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "pull dense after copy table failed," + << " table=" << dest_table; + } + } + } + } +} + +void HeterCpuWorker::CopyDenseVars() { + if (thread_id_ != 0) { + return; + } + for (int i = 0; i < copy_table_config_.src_var_list_size(); ++i) { + auto& src_var_name = copy_table_config_.src_var_list(i); + auto& dest_var_name = copy_table_config_.dest_var_list(i); + if (src_var_name == dest_var_name) { + continue; + } + VLOG(3) << "copy dense var from " << src_var_name << " to " + << dest_var_name; + Variable* src_var = thread_scope_->FindVar(src_var_name); + CHECK(src_var != nullptr) << src_var_name << " not found"; // NOLINT + LoDTensor* src_tensor = src_var->GetMutable(); + CHECK(src_tensor != nullptr) << src_var_name + << " tensor is null"; // NOLINT + float* src_data = src_tensor->data(); + + Variable* dest_var = thread_scope_->FindVar(dest_var_name); + CHECK(dest_var != nullptr) << dest_var_name << " not found"; // NOLINT + LoDTensor* dest_tensor = dest_var->GetMutable(); + CHECK(dest_tensor != nullptr) << dest_var_name + << " tensor is null"; // NOLINT + float* dest_data = dest_tensor->data(); + + CHECK(src_tensor->numel() == dest_tensor->numel()) + << "tensor numel not equal," << src_tensor->numel() << " vs " + << dest_tensor->numel(); + for (int i = 0; i < src_tensor->numel(); i++) { + dest_data[i] = src_data[i]; + } + } +} + +void HeterCpuWorker::TrainFilesWithProfiler() { + VLOG(3) << "Begin to train files with profiler"; + platform::SetNumThreads(1); + device_reader_->Start(); + + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op_name.push_back(op->Type()); + } + } + + VLOG(3) << "op name size: " << op_name.size(); + op_total_time.resize(op_name.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + double pack_time = 0.0; + double pull_sparse_local_time = 0.0; + 
double op_all_time = 0; + double xpu_op_time = 0; + double xpu_wait_time = 0; + double cpu_op_time = 0; + double collect_label_time = 0; + double fill_sparse_time = 0; + double push_sparse_time = 0; + + int batch_cnt = 0; + int done_cnt = 0; + int cur_batch; + uint64_t total_inst = 0; + wait_queue_.SetCap(1); + while (1) { + std::shared_ptr task; + task = run_queue_.Get(); + if (!task) { + double tmp_read_time; + timeline.Start(); + cur_batch = device_reader_->Next(); + timeline.Pause(); + tmp_read_time = timeline.ElapsedSec(); + if (cur_batch <= 0) { + if (batch_cnt == done_cnt) { + break; + } else { + continue; + } + } + batch_cnt += 1; + int taskid = batch_cnt * worker_num_ + thread_id_; + timeline.Start(); + task = object_pool_.Get(); + task->Reset(); + task->PackTask(thread_scope_, taskid, device_reader_, cur_batch, + program_); + timeline.Pause(); + task->read_time = tmp_read_time; + task->pack_time = timeline.ElapsedSec(); + task->total_time = tmp_read_time + task->pack_time; + } + for (;;) { + // pull sparse here + if (task->state_ == PULL_SPARSE) { + timeline.Start(); + for (int i = 0; + i < param_.program_config(0).pull_sparse_table_id_size(); ++i) { + uint64_t tid = static_cast( + param_.program_config(0).pull_sparse_table_id(i)); + TableParameter table; + for (auto j : param_.sparse_table()) { + if (j.table_id() == tid) { + table = j; + break; + } + } + fleet_ptr_->HeterPullSparseVars( + thread_id_, task, tid, sparse_key_names_[tid], table.fea_dim(), + sparse_value_names_[tid]); + } + task->Update(); + // JumpContext(task); + timeline.Pause(); + task->pull_sparse_local_time += timeline.ElapsedSec(); + task->total_time += timeline.ElapsedSec(); + } else if (task->state_ == OP_RUN) { + // total_time += task->total_time; + // read_time += task->read_time; + // pack_time += task->pack_time; + // pull_sparse_local_time += task->pull_sparse_local_time; + for (int i = 0; + i < param_.program_config(0).pull_sparse_table_id_size(); ++i) { + uint64_t tid = static_cast( + param_.program_config(0).pull_sparse_table_id(i)); + timeline.Start(); + CollectLabelInfo(task, i); + timeline.Pause(); + task->collect_label_time += timeline.ElapsedSec(); + task->total_time += timeline.ElapsedSec(); + timeline.Start(); + FillSparseValue(task, i); + timeline.Pause(); + task->fill_sparse_time += timeline.ElapsedSec(); + task->total_time += timeline.ElapsedSec(); + + auto nid_iter = std::find(sparse_value_names_[tid].begin(), + sparse_value_names_[tid].end(), + adjust_ins_weight_config_.nid_slot()); + if (nid_iter != sparse_value_names_[tid].end()) { + AdjustInsWeight(task); + } + } + + VLOG(3) << "fill sparse value for all sparse table done."; + // do computation here + // int run_op_idx = 0; + timeline.Start(); + for (int i = 0; i < xpu_begin_op_index_; ++i) { + auto& op = ops_[i]; + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + // timeline.Start(); + op->Run(*(task->scope_), place_); + // timeline.Pause(); + // op_total_time[run_op_idx++] += timeline.ElapsedSec(); + // total_time += timeline.ElapsedSec(); + } + } + task->Update(); + timeline.Pause(); + task->cpu_op_time += timeline.ElapsedSec(); + task->total_time += timeline.ElapsedSec(); + } else if (task->state_ == XPU) { + timeline.Start(); + VLOG(3) << "call remote xpu taskid = " << task->taskid_; + std::vector send_var_list; + for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) { + 
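// NOTE (reader sketch): the remote pass reuses the HeterService protocol from
// heter_service.proto: each entry of xpu_recv_list is serialized into a
// VariableMessage of the HeterRequest, the XPU side runs its op range, and
// the gradients named in xpu_send_list come back as VariableMessage entries
// of the HeterResponse. CallRemoteXpu is effectively asynchronous from this
// worker's point of view: the task is parked via JumpContext and resumed by
// Schedule() on reply.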
send_var_list.push_back(trainer_desc_.xpu_recv_list(i)); + } + heter_ptr_->CallRemoteXpu(task, this, mpi_rank_, send_var_list); + timeline.Pause(); + task->xpu_op_time += timeline.ElapsedSec(); + task->total_time += timeline.ElapsedSec(); + task->Update(); + timeline.Start(); + JumpContext(task); + timeline.Pause(); + task->xpu_wait_time += timeline.ElapsedSec(); + task->total_time += timeline.ElapsedSec(); + break; + } else if (task->state_ == OP_RUN_END) { + timeline.Start(); + for (size_t i = xpu_end_op_index_ + 1; i < ops_.size(); ++i) { + auto& op = ops_[i]; + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*(task->scope_), place_); + } + } + // check inf and nan + for (std::string& var_name : check_nan_var_names_) { + Variable* var = (task->scope_)->FindVar(var_name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + if (tensor == nullptr) { + continue; + } + } + task->Update(); + timeline.Pause(); + task->cpu_op_time += timeline.ElapsedSec(); + task->total_time += timeline.ElapsedSec(); + } else if (task->state_ == PUSH_GRAD) { + if (need_to_push_sparse_) { + // push gradients here + for (int i = 0; + i < param_.program_config(0).push_sparse_table_id_size(); ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_sparse_table_id(i)); + TableParameter table; + for (auto i : param_.sparse_table()) { + if (i.table_id() == tid) { + table = i; + break; + } + } + timeline.Start(); + fleet_ptr_->HeterPushSparseVars( + task, tid, sparse_key_names_[tid], sparse_grad_names_[tid], + table.emb_dim(), &push_sparse_status_, use_cvm_, dump_slot_, + no_cvm_); + timeline.Pause(); + task->push_sparse_time += timeline.ElapsedSec(); + task->total_time += timeline.ElapsedSec(); + } + } + + if (need_to_push_sparse_) { + VLOG(3) << "push sparse gradient done."; + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + if (push_sparse_status_.size() >= push_sparse_wait_times) { + for (auto& t : push_sparse_status_) { + t.wait(); + } + push_sparse_status_.resize(0); + } + + if (tmp_push_sparse_wait_times == -1) { + push_sparse_status_.resize(0); + } + } + + // thread_scope_->DropKids(); + task->Update(); + } else if (task->state_ == DONE) { + PrintFetchVars(); + ++done_cnt; + total_inst += task->cur_batch_; + object_pool_.Push(task); + + total_time += task->total_time; + read_time += task->read_time; + pack_time += task->pack_time; + pull_sparse_local_time += task->pull_sparse_local_time; + op_all_time += task->op_all_time; + xpu_op_time += task->xpu_op_time; + xpu_wait_time += task->xpu_wait_time; + cpu_op_time += task->cpu_op_time; + collect_label_time += task->collect_label_time; + fill_sparse_time += task->fill_sparse_time; + push_sparse_time += task->push_sparse_time; + // ++batch_cnt; + if (thread_id_ == 0) { + // should be configured here + if (done_cnt > 0 && done_cnt % 100 == 0) { + // double op_sum_time = 0; + // std::unordered_map op_to_time; + // for (size_t i = 0; i < op_total_time.size(); ++i) { + // fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, + // op_name[i].c_str(), op_total_time[i] / done_cnt); + // if (op_to_time.find(op_name[i]) == op_to_time.end()) { + // op_to_time[op_name[i]] = 0.0; + // } + // op_to_time[op_name[i]] += op_total_time[i]; + // op_sum_time += op_total_time[i]; + // } + // for (auto& i : 
op_to_time) { + // fprintf(stderr, "op [%s] run total time: [%f]ms\n", + // i.first.c_str(), + // i.second / done_cnt); + // } + fprintf(stderr, "cpu op run total time: %fs\n", + cpu_op_time / done_cnt); + fprintf(stderr, "xpu op run total time: %fs\n", + xpu_op_time / done_cnt); + fprintf(stderr, "xpu wait total time: %fs\n", + xpu_wait_time / done_cnt); + fprintf(stderr, "pack task time: %fs\n", pack_time / done_cnt); + fprintf(stderr, "train total time: %fs\n", total_time / done_cnt); + fprintf(stderr, "pull sparse local time: %fs\n", + pull_sparse_local_time / done_cnt); + fprintf(stderr, "fill sparse time: %fs\n", + fill_sparse_time / done_cnt); + fprintf(stderr, "push sparse time: %fs\n", + push_sparse_time / done_cnt); + fprintf(stderr, "collect label time: %fs\n", + collect_label_time / done_cnt); + fprintf(stderr, "mean read time: %fs\n", read_time / done_cnt); + fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100); + fprintf(stderr, "cpu op run percent: %f\n", + cpu_op_time / total_time * 100); + fprintf(stderr, "xpu op run percent: %f\n", + xpu_op_time / total_time * 100); + fprintf(stderr, "xpu wait percent: %f\n", + xpu_wait_time / total_time * 100); + fprintf(stderr, "pack task percent: %f\n", + pack_time / total_time * 100); + fprintf(stderr, "pull sparse local time percent: %f\n", + pull_sparse_local_time / total_time * 100); + fprintf(stderr, "collect label time percent: %f\n", + collect_label_time / total_time * 100); + fprintf(stderr, "fill sparse time percent: %f\n", + fill_sparse_time / total_time * 100); + fprintf(stderr, "push sparse time percent: %f\n", + push_sparse_time / total_time * 100); + fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time); + } + } + break; + } + } + } + if (copy_table_config_.need_copy()) { + CopySparseTable(); + CopyDenseTable(); + CopyDenseVars(); + } +} + +void HeterCpuWorker::TrainFiles() { + VLOG(3) << "Begin to train files"; + platform::SetNumThreads(1); + device_reader_->Start(); + int batch_cnt = 0; + int done_cnt = 0; + int cur_batch; + wait_queue_.SetCap(1); + need_to_push_dense_ = false; + while (1) { + // if (copy_table_config_.need_copy()) { + // if (copy_table_config_.sparse_copy_by_feasign()) { + // for (size_t i = 0; i < copy_sparse_tables_.size(); ++i) { + // uint64_t tid = copy_sparse_tables_[i].first; + // feasign_set_[tid].insert(sparse_push_keys_[tid].begin(), + // sparse_push_keys_[tid].end()); + // } + // } + // if (batch_cnt % copy_table_config_.batch_num() == 0) { + // CopySparseTable(); + // CopyDenseTable(); + // CopyDenseVars(); + // } + // } + + std::shared_ptr task; + + task = run_queue_.Get(); + if (!task) { + cur_batch = device_reader_->Next(); + if (cur_batch <= 0) { + if (batch_cnt == done_cnt) { + break; + } else { + continue; + } + } + batch_cnt += 1; + int taskid = batch_cnt * worker_num_ + thread_id_; + task = object_pool_.Get(); + task->Reset(); + task->PackTask(thread_scope_, taskid, device_reader_, cur_batch, + program_); + } + for (;;) { + // pull sparse here + if (task->state_ == PULL_SPARSE) { + VLOG(3) << "pull sparse taskid = " << task->taskid_; + for (int i = 0; + i < param_.program_config(0).pull_sparse_table_id_size(); ++i) { + uint64_t tid = static_cast( + param_.program_config(0).pull_sparse_table_id(i)); + TableParameter table; + for (auto j : param_.sparse_table()) { + if (j.table_id() == tid) { + table = j; + break; + } + } + fleet_ptr_->HeterPullSparseVars( + thread_id_, task, tid, sparse_key_names_[tid], table.fea_dim(), + sparse_value_names_[tid]); + } + 
task->Update(); + // JumpContext(task); + // break; + } else if (task->state_ == OP_RUN) { + VLOG(3) << "oprun taskid = " << task->taskid_; + for (int i = 0; + i < param_.program_config(0).pull_sparse_table_id_size(); ++i) { + uint64_t tid = static_cast( + param_.program_config(0).pull_sparse_table_id(i)); + CollectLabelInfo(task, i); + FillSparseValue(task, i); + auto nid_iter = std::find(sparse_value_names_[tid].begin(), + sparse_value_names_[tid].end(), + adjust_ins_weight_config_.nid_slot()); + if (nid_iter != sparse_value_names_[tid].end()) { + AdjustInsWeight(task); + } + } + VLOG(3) << "fill sparse value for all sparse table done."; + // do computation here + for (int i = 0; i < xpu_begin_op_index_; ++i) { + auto& op = ops_[i]; + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + VLOG(3) << "run op: " << op->Type(); + op->Run(*(task->scope_), place_); + } + } + task->Update(); + } else if (task->state_ == XPU) { + VLOG(3) << "call remote xpu taskid = " << task->taskid_; + std::vector send_var_list; + for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) { + send_var_list.push_back(trainer_desc_.xpu_recv_list(i)); + } + heter_ptr_->CallRemoteXpu(task, this, mpi_rank_, send_var_list); + task->Update(); + JumpContext(task); + break; + } else if (task->state_ == OP_RUN_END) { + for (size_t i = xpu_end_op_index_ + 1; i < ops_.size(); ++i) { + auto& op = ops_[i]; + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*(task->scope_), place_); + } + } + // check inf and nan + for (std::string& var_name : check_nan_var_names_) { + Variable* var = (task->scope_)->FindVar(var_name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + if (tensor == nullptr) { + continue; + } + } + task->Update(); + } else if (task->state_ == PUSH_GRAD) { + VLOG(3) << "push grad taskid = " << task->taskid_; + if (need_to_push_sparse_) { + // push gradients here + for (int i = 0; + i < param_.program_config(0).push_sparse_table_id_size(); ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_sparse_table_id(i)); + TableParameter table; + for (auto i : param_.sparse_table()) { + if (i.table_id() == tid) { + table = i; + break; + } + } + fleet_ptr_->HeterPushSparseVars( + task, tid, sparse_key_names_[tid], sparse_grad_names_[tid], + table.emb_dim(), &push_sparse_status_, use_cvm_, dump_slot_, + no_cvm_); + } + } + + if (need_to_push_sparse_) { + VLOG(3) << "push sparse gradient done."; + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + if (push_sparse_status_.size() >= push_sparse_wait_times) { + for (auto& t : push_sparse_status_) { + t.wait(); + } + push_sparse_status_.resize(0); + } + + if (tmp_push_sparse_wait_times == -1) { + push_sparse_status_.resize(0); + } + } + + // if (need_dump_field_) { + // size_t batch_size = device_reader_->GetCurBatchSize(); + // std::vector ars(batch_size); + // for (auto& ar : ars) { + // ar.clear(); + // } + // auto& ins_id_vec = device_reader_->GetInsIdVec(); + // auto& ins_content_vec = device_reader_->GetInsContentVec(); + // for (size_t i = 0; i < ins_id_vec.size(); i++) { + // ars[i] += ins_id_vec[i]; + // ars[i] = ars[i] + "\t" + ins_content_vec[i]; + // } + 
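// NOTE (reader sketch): push_sparse_wait_times above is
// static_cast<uint32_t>(-1), i.e. UINT32_MAX, so the ">=" batch-wait branch
// essentially never triggers; and because tmp_push_sparse_wait_times is -1,
// the status vector is cleared on every pass instead, making the sparse push
// fire-and-forget. The same pattern appears in DownpourWorker.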
// for (auto& field : dump_fields_) { + // Variable* var = thread_scope_->FindVar(field); + // if (var == nullptr) { + // continue; + // } + // LoDTensor* tensor = var->GetMutable(); + // if (!CheckValidOutput(tensor, batch_size)) { + // continue; + // } + // for (size_t i = 0; i < batch_size; ++i) { + // auto output_dim = tensor->dims()[1]; + // std::string output_dimstr = + // boost::lexical_cast(output_dim); + // ars[i] = ars[i] + "\t" + field + ":" + output_dimstr; + // auto bound = GetTensorBound(tensor, i); + // ars[i] += PrintLodTensor(tensor, bound.first, bound.second); + // } + // } + // // #pragma omp parallel for + // for (size_t i = 0; i < ars.size(); i++) { + // if (ars[i].length() == 0) { + // continue; + // } + // writer_ << ars[i]; + // } + // if (need_dump_param_ && thread_id_ == 0) { + // DumpParam(); + // } + // } + + // thread_scope_->DropKids(); + task->Update(); + } else if (task->state_ == DONE) { + VLOG(3) << "done taskid = " << task->taskid_; + object_pool_.Push(task); + PrintFetchVars(); + ++done_cnt; + // ++batch_cnt; + break; + } + } + } + if (need_dump_field_) { + // writer_.Flush(); + } + if (copy_table_config_.need_copy()) { + CopySparseTable(); + CopyDenseTable(); + CopyDenseVars(); + } +} + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ca1aa66319228fc95f63294b15c981dd3c8ba30 --- /dev/null +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -0,0 +1,469 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "io/fs.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/trainer.h" +#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#include "paddle/fluid/platform/cuda_device_guard.h" + +namespace paddle { +namespace framework { + +void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, + Dataset* dataset) { + srand((unsigned)time(NULL)); + param_ = trainer_desc.downpour_param(); + for (int i = 0; i < param_.dense_table_size(); ++i) { + uint64_t table_id = static_cast(param_.dense_table(i).table_id()); + auto table = param_.dense_table(i); + dense_grad_names_[table_id].resize(table.dense_grad_name_size()); + for (int j = 0; j < table.dense_grad_name_size(); ++j) { + dense_grad_names_[table_id][j] = table.dense_grad_name(j); + } + } + scale_datanorm_ = trainer_desc.scale_datanorm(); + int place_num = trainer_desc.worker_places_size(); + for (int i = 0; i < place_num; ++i) { + int num = trainer_desc.worker_places(i); + platform::CUDAPlace place = platform::CUDAPlace(num); + platform::CUDADeviceGuard guard(place.device); + cudaStream_t stream; + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + copy_streams_.push_back(stream); + places_.push_back(place); + cudaEvent_t event; + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + events_.push_back(event); + } + // thread_num_ = trainer_desc.thread_num(); + // SetDataset(dataset); + + // dump_fields_path_ = trainer_desc.dump_fields_path(); + // dump_converter_ = trainer_desc.dump_converter(); + // need_dump_field_ = false; + // if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") { + // need_dump_field_ = true; + // } + // if (need_dump_field_) { + // auto &file_list = dataset->GetFileList(); + // if (file_list.size() == 0) { + // need_dump_field_ = false; + // } + // } + // mpi_rank_ = trainer_desc.mpi_rank(); + // mpi_size_ = trainer_desc.mpi_size(); + // dump_file_num_ = trainer_desc.dump_file_num(); + // const std::vector readers = + // dataset->GetReaders(); + // thread_num_ = readers.size(); + for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size(); + i++) { + need_merge_var_names_.push_back( + trainer_desc.downpour_param().stat_var_names(i)); + } + running_ = true; + VLOG(3) << "going to initialize pull dense worker"; + pull_dense_worker_ = PullDenseWorker::GetInstance(); + pull_dense_worker_->Initialize(trainer_desc); + VLOG(3) << "initialize pull dense worker"; + SetDebug(trainer_desc.debug()); + fleet_ptr_ = FleetWrapper::GetInstance(); + heter_ptr_ = HeterWrapper::GetInstance(); + RegisterServiceHandler(); + // for (int i = 0; i < trainer_desc.worker_places_size(); ++i) { + // int num = trainer_desc.worker_places(i); + // platform::CUDAPlace place = platform::CUDAPlace(num); + // platform::CUDADeviceGuard guard(place.device); + // cudaStream_t stream; + // PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + // copy_streams_.push_back(stream); + // places_.push_back(place); + // } + trainer_desc_ = trainer_desc; +} + +void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { + auto place = places_[num]; + Scope* scope = place_scopes_[num]; + auto stream = copy_streams_[num]; + auto event = events_[num]; + + auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, 
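// NOTE (reader sketch): Initialize() created one cudaStream_t plus one
// timing-disabled cudaEvent_t per entry of worker_places; CreateThreadParam
// uses that pair to copy every persistable variable of the program into the
// matching device scope, then records and synchronizes the event so the
// parameters are resident before any service request runs. HeterMemcpyFunc
// below is expanded by _ForEachDataType_ to dispatch on the tensor's runtime
// dtype.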
place).device; + platform::CUDADeviceGuard guard(dev_id); + auto& block = program.Block(0); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto name = var->Name(); + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + auto* ptr = scope->Var(name); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + LoDTensor* thread_tensor = ptr->GetMutable(); + +#define HeterMemcpyFunc(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + HeterMemCpy(thread_tensor, root_tensor, place, stream); \ + } \ + } while (0) + _ForEachDataType_(HeterMemcpyFunc); + } + } + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + cudaEventSynchronize(event); +} + +template +void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, + LoDTensor* root_tensor, + const paddle::platform::Place& thread_place, + cudaStream_t stream) { + T* thread_ptr = + thread_tensor->mutable_data(root_tensor->dims(), thread_place); + T* root_ptr = root_tensor->data(); + if (platform::is_cpu_place(root_tensor->place())) { + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, + platform::CPUPlace(), root_ptr, + sizeof(T) * root_tensor->numel(), stream); + } else { + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, + BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()), + root_ptr, sizeof(T) * root_tensor->numel(), stream); + } +} + +void HeterXpuTrainer::DumpWork(int tid) {} + +void HeterXpuTrainer::InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place) { + CacheProgram(main_program); + place_ = place; + auto& profiler = paddle::ps::CostProfiler::instance(); + profiler.register_profiler("xpu_service_run_task"); + profiler.register_profiler("xpu_service_deserial"); + profiler.register_profiler("xpu_service_launch_kernel"); + profiler.register_profiler("xpu_service_wait"); +} + +void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { + auto& block = main_program.Block(0); + pull_dense_worker_->SetRootScope(root_scope_); + pull_dense_worker_->CreatePinVar(); + for (size_t i = 0; i < places_.size(); ++i) { + Scope* scope = &(root_scope_->NewScope()); + // for (auto &var : block.AllVars()) { + // if (var->Persistable()) { + // auto *ptr = scope->Var(var->Name()); + // InitializeVariable(ptr, var->GetType()); + // } + // } + place_scopes_.push_back(scope); + CreateThreadParam(main_program, i); + pull_dense_worker_->AddThreadScope(scope); + pull_dense_worker_->AddPlace(places_[i]); + pull_dense_worker_->AddStream(copy_streams_[i]); + } + + pull_dense_worker_->Start(); + for (auto& stream : copy_streams_) { + cudaStreamSynchronize(stream); + } + op_names_.clear(); + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + op_names_.push_back(op_desc->Type()); + OperatorBase* local_op_ptr = local_op.release(); + ops_.push_back(local_op_ptr); + continue; + } + xpu_begin_op_index_ = xpu_end_op_index_ = -1; + xpu_begin_op_index_ = trainer_desc_.xpu_start_idx(); + xpu_end_op_index_ = trainer_desc_.xpu_end_idx(); + VLOG(0) << "xpu begin: " << xpu_begin_op_index_ + << " xpu end: " << xpu_end_op_index_; + // CHECK(xpu_begin_op_index_ == 0); + // CHECK(xpu_end_op_index_ = ops_.size() - 1); + //// init pool + for (size_t i = 0; i < 6; ++i) { + for (size_t j = 0; j < places_.size(); ++j) { + int num = j; + std::shared_ptr context = + std::make_shared(); + context->place_num_ = num; + auto place = places_[num]; + 
context->scope_ = &(place_scopes_[num]->NewScope()); + auto& block = program_.Block(0); + for (auto& var : block.AllVars()) { + if (!var->Persistable()) { + auto* ptr = context->scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } + for (auto& v : dense_grad_names_) { + for (auto& name : v.second) { + auto* ptr = context->scope_->Var(name + "pin"); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + } + } + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + OperatorBase* local_op_ptr = local_op.release(); + (context->ops_).push_back(local_op_ptr); + } + auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + platform::CUDADeviceGuard guard(dev_id); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); + object_pool_.Push(context); + } + } + VLOG(3) << "init other env done."; +} + +void HeterXpuTrainer::Run() {} + +int HeterXpuTrainer::EndPass(const HeterRequest* request, + HeterResponse* response) { + // int scope_num = object_pool_.Size(); + for (size_t i = 0; i < need_merge_var_names_.size(); i++) { + Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]); + if (root_var == nullptr) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + + for (size_t j = 0; j < place_scopes_.size(); j++) { + Scope* cur_thread_scope = place_scopes_[j]; + Variable* thread_var = + cur_thread_scope->FindVar(need_merge_var_names_[i]); + if (thread_var == nullptr) { + continue; + } + LoDTensor* thread_tensor = thread_var->GetMutable(); +// if (root_tensor->numel() != thread_tensor->numel()) { +// continue; +// } +#define MergeCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + if (thread_tensor->type() != proto_type) { \ + VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ + << "] " << need_merge_var_names_[i] \ + << ", root tensor type=" << root_tensor->type() \ + << ", thread tensor type=" << thread_tensor->type(); \ + exit(-1); \ + } \ + MergeToRootScope(root_tensor, thread_tensor); \ + } \ + } while (0) + _ForEachDataType_(MergeCallback); + if (platform::is_gpu_place(thread_tensor->place())) { + auto dev_id = + BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; + platform::CUDADeviceGuard guard(dev_id); + cudaMemset(thread_tensor->data(), 0, + thread_tensor->numel() * SizeOfType(thread_tensor->type())); + } else { + memset(thread_tensor->data(), 0, + thread_tensor->numel() * SizeOfType(thread_tensor->type())); + } + } + auto* merge_var = response->add_vars(); + heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_, + merge_var); + if (platform::is_gpu_place(root_tensor->place())) { + auto dev_id = + BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; + platform::CUDADeviceGuard guard(dev_id); + cudaMemset(root_tensor->data(), 0, + root_tensor->numel() * SizeOfType(root_tensor->type())); + } else { + memset(root_tensor->data(), 0, + root_tensor->numel() * SizeOfType(root_tensor->type())); + } + } + return 0; +} + +template +void HeterXpuTrainer::MergeToRootScope(LoDTensor* root_tensor, + LoDTensor* tensor) { + LoDTensor tmp_root; + TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root); + T* tmp_root_data = tmp_root.data(); + LoDTensor tmp_tensor; + TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor); + T* data = tmp_tensor.data(); + for (int i = 0; i < tmp_tensor.numel(); i++) { + tmp_root_data[i] += data[i]; + } + 
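// NOTE (reader sketch): MergeToRootScope stages both tensors on the CPU,
// accumulates element-wise, and only then writes the sum back to wherever the
// root tensor lives (CPU or GPU); EndPass subsequently zeroes every thread
// copy so the next pass accumulates the stat vars from zero.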
TensorCopy(tmp_root, root_tensor->place(), root_tensor); +} + +int HeterXpuTrainer::StopService(const HeterRequest* request, + HeterResponse* response) { + std::unique_lock lock(mutex_); + running_ = false; + cond_.notify_one(); + return 0; +} + +int HeterXpuTrainer::RunTask(const HeterRequest* request, + HeterResponse* response) { + auto timer = std::make_shared("xpu_service_run_task"); + std::shared_ptr context = object_pool_.Get(); + + if (!context->scope_) { + int num = rand() % places_.size(); + context->place_num_ = num; + auto place = places_[num]; + context->scope_ = &(place_scopes_[num]->NewScope()); + auto& block = program_.Block(0); + for (auto& var : block.AllVars()) { + if (!var->Persistable()) { + auto* ptr = context->scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } + for (auto& v : dense_grad_names_) { + for (auto& name : v.second) { + auto* ptr = context->scope_->Var(name + "pin"); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + } + } + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + OperatorBase* local_op_ptr = local_op.release(); + (context->ops_).push_back(local_op_ptr); + } + + auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; + platform::CUDADeviceGuard guard(dev_id); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); + } + + context->Reset(); + auto place = places_[context->place_num_]; + { + auto deserial_timer = + std::make_shared("xpu_service_deserial"); + for (int i = 0; i < request->vars_size(); ++i) { + heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place, + copy_streams_[context->place_num_]); + } + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventRecord(context->event_, copy_streams_[context->place_num_])); + while (cudaEventQuery(context->event_) != cudaSuccess) { + VLOG(3) << "wait for kernel"; + bthread_yield(); + } + } + + { + auto launch_timer = + std::make_shared("xpu_service_launch_kernel"); + for (int i = xpu_begin_op_index_; i <= xpu_end_op_index_; ++i) { + auto& op = (context->ops_)[i]; + op->Run(*(context->scope_), place); + } + } + auto* dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventRecord(context->event_, dev_ctx->stream())); + // cudaEventSynchronize(context->event_); + { + auto wait_timer = + std::make_shared("xpu_service_wait"); + while (cudaEventQuery(context->event_) != cudaSuccess) { + VLOG(3) << "wait for kernel"; + bthread_yield(); + } + } + + for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) { + const std::string& varname = trainer_desc_.xpu_send_list(i); + // CHECK(varname == "concat_1.tmp_0@GRAD"); + auto* res_var = response->add_vars(); + heter_ptr_->SerializeToReq(varname, context->scope_, res_var); + } + + // std::string varname = "concat_1.tmp_0@GRAD"; + // + // auto* res_var = response->add_vars(); + // heter_ptr_->SerializeToReq(varname, context->scope_, res_var); + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { + uint64_t tid = + static_cast(param_.program_config(0).push_dense_table_id(i)); + fleet_ptr_->PushDenseVarsAsync( + *(context->scope_), tid, dense_grad_names_[tid], + &(context->push_dense_status_), scale_datanorm_, request->cur_batch(), + places_[context->place_num_], copy_streams_[context->place_num_], + context->event_); + } + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { + uint64_t tid = + 
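// NOTE (reader sketch): RunTask's request path, in order: deserialize the
// HeterRequest vars onto this context's copy stream, spin on the cudaEvent
// with bthread_yield() (so the brpc worker thread is never blocked), run ops
// in [xpu_begin_op_index_, xpu_end_op_index_], serialize xpu_send_list into
// the response, then push dense grads asynchronously; the loop below only
// bumps the dense-table version counters consumed by PullDenseWorker.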
static_cast(param_.program_config(0).push_dense_table_id(i)); + pull_dense_worker_->IncreaseThreadVersion(0, tid); + } + VLOG(3) << "push dense gradient done."; + context->scope_->DropKids(); + object_pool_.Push(context); + VLOG(0) << "pool size " << object_pool_.Size(); + return 0; +} + +void HeterXpuTrainer::RegisterServiceHandler() { + heter_ptr_->RegisterServiceHandler( + 0, [this](const HeterRequest* request, HeterResponse* response) -> int { + return this->RunTask(request, response); + }); + heter_ptr_->RegisterServiceHandler( + 1, [this](const HeterRequest* request, HeterResponse* response) -> int { + return this->EndPass(request, response); + }); + heter_ptr_->RegisterServiceHandler( + 2, [this](const HeterRequest* request, HeterResponse* response) -> int { + return this->StopService(request, response); + }); +} + +Scope* HeterXpuTrainer::GetWorkerScope(int thread_id) { return nullptr; } + +void HeterXpuTrainer::Finalize() { + // for (auto &th : threads_) { + // th.join(); + // } + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return !running_; }); + sleep(3); + pull_dense_worker_->Stop(); + root_scope_->DropKids(); +} + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 4ae26903e66c521f26eb3514622f03f7338c64e1..030e80c0b3fa12ea2dd8f0dcc676a42ef68db3ea 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -102,6 +102,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, workers_[i]->SetRootScope(root_scope_); workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->BindingDataFeedMemory(); + workers_[i]->CacheProgram(main_program); } } diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index d66e1d8062e0b3fa91e9e3de3290806c4397bff2..9f4c817db7d81ff6d8a9afb9d5cee7b4f1dd0ed2 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -56,6 +56,34 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { current_version_[tid] = 0; } fleet_ptr_ = FleetWrapper::GetInstance(); +#ifdef PADDLE_WITH_CUDA + copy_streams_.clear(); + places_.clear(); + thread_scopes_.clear(); +#endif +} + +void PullDenseWorker::CreatePinVar() { +#ifdef PADDLE_WITH_CUDA + // for (auto& v : dense_value_names_) { + // for (auto& name : v.second) { + for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); + ++i) { + uint64_t tid = static_cast( + dwp_param_.program_config(0).pull_dense_table_id(i)); + for (size_t j = 0; j < dense_value_names_[tid].size(); j++) { + auto& name = dense_value_names_[tid][j]; + Variable* var = root_scope_->FindVar(name); + + LoDTensor* tensor = var->GetMutable(); + auto* ptr = root_scope_->Var(name + "pin"); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + LoDTensor* pin_tensor = ptr->GetMutable(); + pin_tensor->mutable_data(tensor->dims(), + platform::CUDAPinnedPlace()); + } + } +#endif } void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { @@ -75,6 +103,31 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { exit(-1); } status_vec->resize(0); +#ifdef PADDLE_WITH_CUDA + + for (size_t i = 0; i < places_.size(); ++i) { + // for (auto& v : dense_value_names_) { + // for (auto& name : v.second) { + for (int x = 0; x < dwp_param_.program_config(0).pull_dense_table_id_size(); + ++x) { + uint64_t tid = static_cast( + 
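// NOTE (reader sketch, hedged): the pinned-variable path: CreatePinVar()
// allocates a "<name>pin" CUDA-pinned twin in the root scope for every dense
// value; PullDenseVarsAsync appears to fill that pinned copy (the extra bool
// flag passed under PADDLE_WITH_CUDA), and the loop below fans it out to each
// worker place with an async memcpy on that place's dedicated copy stream.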
dwp_param_.program_config(0).pull_dense_table_id(x)); + for (size_t j = 0; j < dense_value_names_[tid].size(); j++) { + auto& name = dense_value_names_[tid][j]; + + Variable* pin_var = root_scope_->FindVar(name + "pin"); + LoDTensor* pin_tensor = pin_var->GetMutable(); + float* pin_w = pin_tensor->data(); + Variable* var = thread_scopes_[i]->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + float* w = tensor->data(); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, + platform::CUDAPinnedPlace(), pin_w, + sizeof(float) * tensor->numel(), copy_streams_[i]); + } + } + } +#endif } void PullDenseWorker::Stop() { @@ -91,8 +144,14 @@ void PullDenseWorker::PullDense(bool force_update) { uint64_t tid = static_cast( dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], - &pull_dense_status_); + &pull_dense_status_, false); +#else + fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], + &pull_dense_status_, true); +#endif ResetThreadVersion(tid); } } diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index bb56b3ea3d251d53d6e8e494ec1c658574c2e96c..077fe75172022c8fe501bd1143895115298417bf 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -21,9 +21,12 @@ limitations under the License. */ #include // NOLINT #include +#include #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/framework/heter_service.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/reader.h" @@ -62,6 +65,7 @@ class TrainerBase { Scope* root_scope_; bool debug_; Dataset* dataset_ptr_; + TrainerDesc trainer_desc_; // For dump param or field bool need_dump_field_ = false; @@ -118,10 +122,86 @@ class DistMultiTrainer : public MultiTrainer { void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); virtual void InitDumpEnv(); virtual Scope* GetWorkerScope(int thread_id); + virtual void RegisterHeterCallback(); + + protected: + std::shared_ptr pull_dense_worker_; +}; + +#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +class HeterServiceContext { + public: + HeterServiceContext() {} + virtual ~HeterServiceContext() { + for (OperatorBase* op : ops_) { + delete op; + } + std::vector().swap(ops_); + } + void Reset() { push_dense_status_.clear(); } + int place_num_; + Scope* scope_{nullptr}; + cudaEvent_t event_; + std::vector ops_; + std::vector<::std::future> push_dense_status_; +}; + +class HeterXpuTrainer : public TrainerBase { + public: + HeterXpuTrainer() {} + virtual ~HeterXpuTrainer() { + for (OperatorBase* op : ops_) { + delete op; + } + std::vector().swap(ops_); + } + virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set); + virtual void InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place); + virtual void InitOtherEnv(const ProgramDesc& main_program); + virtual void Run(); + virtual void Finalize(); + virtual void DumpWork(int tid); + virtual void RegisterServiceHandler(); + virtual int RunTask(const HeterRequest* request, HeterResponse* response); + virtual Scope* GetWorkerScope(int thread_id); + 
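  // Service-side contract (see RegisterServiceHandler in heterxpu_trainer.cc):
  //   cmd 0 -> RunTask    : one forward/backward slice per HeterRequest
  //   cmd 1 -> EndPass    : merge and reset stat vars, reply with merged vars
  //   cmd 2 -> StopService: flips running_ so Finalize() can drain and stop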
virtual void CacheProgram(const ProgramDesc& main_program) { + new (&program_) ProgramDesc(main_program); + } + template + void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor, + const paddle::platform::Place& thread_place, + cudaStream_t stream); + void CreateThreadParam(const ProgramDesc& program, int num); + template + void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); + int EndPass(const HeterRequest* request, HeterResponse* response); + int StopService(const HeterRequest* request, HeterResponse* response); protected: + DownpourWorkerParameter param_; + std::map> dense_grad_names_; + std::vector need_merge_var_names_; + float scale_datanorm_; + int xpu_begin_op_index_; + int xpu_end_op_index_; + bool running_; + paddle::platform::Place place_; + std::mutex mutex_; + ProgramDesc program_; + std::condition_variable cond_; + std::shared_ptr fleet_ptr_; + std::shared_ptr heter_ptr_; std::shared_ptr pull_dense_worker_; + std::vector ops_; + std::vector op_names_; + std::vector place_scopes_; + BtObjectPool object_pool_; + std::vector copy_streams_; + std::vector places_; + std::vector events_; }; +#endif #if defined(PADDLE_WITH_NCCL) class PipelineTrainer : public TrainerBase { diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 670ae074c7c7f0e3bcd91e157ba7b01b48d3b7ee..1985742fc4aa6a0fc67f552f2b69902840a00d0f 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -52,6 +52,12 @@ message TrainerDesc { optional bool enable_random_dump = 24 [ default = false ]; optional bool random_with_lineid = 25 [ default = false ]; optional int32 dump_interval = 26 [ default = 10000 ]; + repeated int32 worker_places = 27; + + repeated string xpu_send_list = 28; + repeated string xpu_recv_list = 29; + optional int32 xpu_start_idx = 30; + optional int32 xpu_end_idx = 31; // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 23cfa11d4c9b2ef8542cd318970a58cad84e662d..31ac11e78cff15f12660b84b96a007690aa77ae3 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -63,6 +63,9 @@ std::shared_ptr TrainerFactory::CreateTrainer( REGISTER_TRAINER_CLASS(MultiTrainer); REGISTER_TRAINER_CLASS(DistMultiTrainer); +#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +REGISTER_TRAINER_CLASS(HeterXpuTrainer); +#endif #if defined(PADDLE_WITH_NCCL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index aff5f49c73bdce9a5899ed8be17419b8301d52c0..d23beea7e4e62ee65f31c4dc903d80310ddfccbc 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -241,6 +241,156 @@ class Flatten2GradOp : public framework::OperatorWithKernel { } }; +class FlattenContiguousRangeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FlattenContiguousRange"); + const auto &start_axis = ctx->Attrs().Get("start_axis"); + const auto &stop_axis = ctx->Attrs().Get("stop_axis"); + const auto &in_dims = ctx->GetInputDim("X"); + int in_dims_size = 
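// NOTE (worked example, assuming X.shape = (3, 100, 100, 4); negative axes
// are normalized by adding the input rank first):
//   start_axis = 1, stop_axis = 2  ->  Out.shape = (3, 10000, 4)
//   start_axis = 1, stop_axis = -1 ->  Out.shape = (3, 40000)
// i.e. dims in [start_axis, stop_axis] collapse into their product and the
// surrounding dims pass through unchanged.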
in_dims.size();
+    int real_start_axis = start_axis, real_stop_axis = stop_axis;
+    if (start_axis < 0) {
+      real_start_axis = start_axis + in_dims_size;
+    }
+    if (stop_axis < 0) {
+      real_stop_axis = stop_axis + in_dims_size;
+    }
+    PADDLE_ENFORCE_GE(
+        real_stop_axis, real_start_axis,
+        platform::errors::InvalidArgument("The stop_axis should be greater "
+                                          "than or equal to start_axis."));
+
+    const auto &out_dims =
+        GetOutputShape(real_start_axis, real_stop_axis, in_dims);
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (in_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
+
+    OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape",
+                   "FlattenContiguousRange");
+    std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+    xshape_dims[0] = 0;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      xshape_dims[i + 1] = in_dims[i];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
+    ctx->ShareLoD("X", "XShape");
+  }
+
+  static std::vector<int32_t> GetOutputShape(const int start_axis,
+                                             const int stop_axis,
+                                             const framework::DDim &in_dims) {
+    int64_t outer = 1;
+    std::vector<int32_t> out_shape;
+    int in_dims_size = in_dims.size();
+    out_shape.reserve(in_dims_size - stop_axis + start_axis);
+
+    for (int i = 0; i < start_axis; ++i) {
+      out_shape.push_back(in_dims[i]);
+    }
+    for (int i = start_axis; i <= stop_axis; i++) {
+      outer *= in_dims[i];
+    }
+    out_shape.push_back(outer);
+    for (int i = stop_axis + 1; i < in_dims_size; i++) {
+      out_shape.push_back(in_dims[i]);
+    }
+
+    return out_shape;
+  }
+};
+
+class FlattenContiguousRangeOpMaker : public FlattenOpMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input tensor to be flattened.");
+    AddOutput("Out",
+              "A tensor reshaped from the input. The input dimensions from "
+              "start_axis to stop_axis are flattened into a single output "
+              "dimension; dimensions outside that range are left unchanged.");
+    AddAttr<int>("start_axis",
+                 "(int)"
+                 "Indicate the input start dimension (inclusive) to flatten")
+        .SetDefault(1);
+    AddAttr<int>("stop_axis",
+                 "(int)"
+                 "Indicate the input stop dimension (inclusive) to flatten")
+        .SetDefault(1);
+    AddComment(R"DOC(
+Flatten Operator
+
+Flattens the input tensor into a new tensor according to start_axis and stop_axis.
+ +Examples: +Case 1: + Given + X.shape = (3, 100, 100, 4) + and + start_axis = 2, stop_axis = -1 + We get: + Out.shape = (3, 100, 400) + +Case 2: + Given + X.shape = (3, 100, 100, 4) + and + start_axis = 0, stop_axis = -1 + We get: + Out.shape = (3 * 100 * 100 * 4) +)DOC"); + AddOutput("XShape", + "XShape is just used to store the shape and lod of X, which will " + "be used in FlattenGradOp.") + .AsIntermediate(); + } +}; + +template +class FlattenContiguousRangeGradOpMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("flatten_contiguous_range_grad"); + grad_op->SetInput("XShape", this->Output("XShape")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class FlattenContiguousRangeGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("XShape"), "Input", "XShape", + "FlattenContiguousRangeGrad"); + OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "FlattenContiguousRangeGrad"); + auto xshape_dims = context->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + context->SetOutputDim(framework::GradVarName("X"), x_dims); + context->ShareLoD("XShape", framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceInferer, {framework::GradVarName("Out"), @@ -266,6 +416,16 @@ REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, ops::FlattenGradInplaceInferer); +REGISTER_OPERATOR( + flatten_contiguous_range, ops::FlattenContiguousRangeOp, + ops::FlattenContiguousRangeOpMaker, + ops::FlattenContiguousRangeGradOpMaker, + ops::FlattenContiguousRangeGradOpMaker, + ops::FlattenOpInplaceInferer); +REGISTER_OPERATOR(flatten_contiguous_range_grad, + ops::FlattenContiguousRangeGradOp, + ops::FlattenGradInplaceInferer); + REGISTER_OP_CPU_KERNEL( flatten, ops::FlattenKernel, ops::FlattenKernel, @@ -292,3 +452,26 @@ REGISTER_OP_CPU_KERNEL( ops::Flatten2GradKernel, ops::Flatten2GradKernel, ops::Flatten2GradKernel); +REGISTER_OP_CPU_KERNEL( + flatten_contiguous_range, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel); +REGISTER_OP_CPU_KERNEL( + flatten_contiguous_range_grad, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel); diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc index ac4ad8e2dc1c09f5ee9f0adfb8b19e0e4ec374a4..40fda804eaab9d280fb91c97fb4c4983a28487d0 100644 --- a/paddle/fluid/operators/flatten_op.cu.cc +++ 
b/paddle/fluid/operators/flatten_op.cu.cc @@ -42,3 +42,26 @@ REGISTER_OP_CUDA_KERNEL( ops::Flatten2GradKernel, ops::Flatten2GradKernel, ops::Flatten2GradKernel); +REGISTER_OP_CUDA_KERNEL( + flatten_contiguous_range, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel); +REGISTER_OP_CUDA_KERNEL( + flatten_contiguous_range_grad, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel); diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 165832c0e68bdef38f0382ea29f7655a18345805..08efaedccd4f40033dfa02a801911f6666e14ec8 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -112,5 +112,73 @@ class Flatten2GradKernel : public framework::OpKernel { } }; +template +class FlattenContiguousRangeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto &start_axis = context.Attr("start_axis"); + auto &stop_axis = context.Attr("stop_axis"); + + auto *in = context.Input("X"); + auto x_dims = in->dims(); + int in_dims_size = x_dims.size(); + int real_start_axis = start_axis, real_stop_axis = stop_axis; + if (start_axis < 0) { + real_start_axis = start_axis + in_dims_size; + } + if (stop_axis < 0) { + real_stop_axis = stop_axis + in_dims_size; + } + auto *out = context.Output("Out"); + + auto out_dims = framework::make_ddim( + GetOutputShape(real_start_axis, real_stop_axis, x_dims)); + + out->mutable_data(context.GetPlace(), in->type()); + framework::TensorCopy( + *in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); + } + static std::vector GetOutputShape(const int start_axis, + const int stop_axis, + const framework::DDim &in_dims) { + int64_t outer = 1; + std::vector out_shape; + int in_dims_size = in_dims.size(); + out_shape.reserve(in_dims_size - stop_axis + start_axis); + + for (int i = 0; i < start_axis; ++i) { + out_shape.push_back(in_dims[i]); + } + for (int i = start_axis; i <= stop_axis; i++) { + outer *= in_dims[i]; + } + out_shape.push_back(outer); + for (int i = stop_axis + 1; i < in_dims_size; i++) { + out_shape.push_back(in_dims[i]); + } + + return out_shape; + } +}; + +template +class FlattenContiguousRangeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + + auto xshape_dims = ctx.Input("XShape")->dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(x_dims); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 24f656140f42dfc7ca64afd0e02bdab6e2774244..3fc5f3bfc6b1633ffe835606bbac6118e6b32ca6 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -7,7 +7,12 @@ register_operators(EXCLUDES fused_fc_elementwise_layernorm_op multihead_matmul_op fused_embedding_eltwise_layernorm_op - fusion_group_op) + fusion_group_op + 
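+    # fusion_gru_op is excluded from the batch registration above because it
+    # has no CUDA kernel; it is registered explicitly via op_library() below
+    # so its oneDNN kernel can still be attached.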
fusion_gru_op) + +# fusion_gru_op does not have CUDA kernel +op_library(fusion_gru_op) +file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n") if (WITH_GPU) # fused_bn_activation_op needs cudnn 7.4.1 above diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index f6c8316e2e9fa071dc58fb8fc43baad9055c5475..d0920098f606e49d4d1a3e4cb6d8a2b6c44ca267 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -122,8 +125,17 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionGRUOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context()); + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout, + library); } void FusionGRUOpMaker::Make() { @@ -187,6 +199,9 @@ void FusionGRUOpMaker::Make() { "bool" "use origin mode in article https://arxiv.org/abs/1412.3555") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator. This operator fuse the fully-connected operator into GRU, diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3940aae53b8ef70c15311305ce13f8929400d405 --- /dev/null +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -0,0 +1,439 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fused/fusion_gru_op.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::LoDTensor; +using paddle::framework::Tensor; +using paddle::platform::CPUDeviceContext; +using paddle::platform::MKLDNNGetDataType; +using paddle::platform::MKLDNNMemDesc; +using platform::to_void_cast; + +template +class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, + const platform::MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const LoDTensor* input, + const Tensor* weight_h, const Tensor* h0, + const bool is_reverse, const int64_t N, const int64_t Ti, + const int64_t IC, const int64_t OC, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(unique_name, Ti)), + N(N), + Ti(Ti), + IC(IC), + OC(OC) { + // Create memory key without Ti because weights, bias and h0 memories + // do not depend on Ti size but primitive and input/output memory do + if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() != + platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) { + memory_key_ = unique_name; + } else { + memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr(); + } + + if (!this->isCached()) { + // oneDNN kernel has hardcoded activation functions + PADDLE_ENFORCE_EQ( + ctx.Attr("gate_activation"), "sigmoid", + platform::errors::Unimplemented( + "oneDNN fusion_gru supports only sigmoid as a gate activation.")); + PADDLE_ENFORCE_EQ( + ctx.Attr("activation"), "tanh", + platform::errors::Unimplemented( + "oneDNN fusion_gru supports only tanh as an activation.")); + + // oneDNN RNN dimensions + const int64_t D = 1; // Directions + const int64_t L = 1; // Layers (PP supports only 1 stacked layer) + const int64_t G = 3; // Number of Gates, 3 for GRU + + // Create memory descriptors + auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); + auto weight_x_md = MKLDNNMemDesc( + {L, D, IC, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + auto weight_h_md = MKLDNNMemDesc( + {L, D, OC, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldgo); + auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); + auto h0_md = dnnl::memory::desc(); + if (h0) { + h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc); + } + + // Create GRU oneDNN primitive + const auto direction = + is_reverse ? 
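+          // fusion_gru consumes the whole sequence in a single oneDNN
+          // primitive, so a reversed-sequence GRU is expressed through the
+          // primitive's right-to-left direction instead of physically
+          // reversing the input data.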
dnnl::rnn_direction::unidirectional_right2left + : dnnl::rnn_direction::unidirectional_left2right; + + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_inference, direction, input_md, h0_md, + weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc()); + } + } + + bool is_NTC() { + return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) == + dnnl::memory::format_tag::ntc); + } + + void reorderRNNdata(const T* input_data, T* output_data, + std::vector lod, const bool is_reverse, + platform::RNNReorderType reorder_type) { + switch (reorder_type) { + // Reorder input memory [WORDS, C] + LoD -> [N, T, C] + case platform::RNNReorderType::PP_NTC: { + auto* input_data_iter = input_data; + for (int n = 0; n < N; ++n) { + const auto num_elements = (lod[n + 1] - lod[n]) * IC; + const auto offset = is_reverse ? (Ti * IC - num_elements) : 0; + memcpy(output_data + n * Ti * IC + offset, input_data_iter, + sizeof(T) * num_elements); + input_data_iter += num_elements; + } + } break; + // Reorder input memory [WORDS, C] + LoD -> [T, N, C] + case platform::RNNReorderType::PP_TNC: { + auto* input_data_iter = input_data; + for (int n = 0; n < N; ++n) { + const auto num_elements = (lod[n + 1] - lod[n]); + const auto offset = is_reverse ? (Ti - num_elements) : 0; + for (size_t t = 0; t < num_elements; ++t) { + memcpy(output_data + (t + offset) * N * IC + n * IC, + input_data_iter, sizeof(T) * IC); + input_data_iter += IC; + } + } + } break; + // Reorder output values to PP format [N, T, C] -> [WORDS, C] + case platform::RNNReorderType::NTC_PP: { + auto* output_data_iter = output_data; + for (int n = 0; n < N; ++n) { + const auto num_elements = (lod[n + 1] - lod[n]) * OC; + const auto offset = is_reverse ? (Ti * OC - num_elements) : 0; + memcpy(output_data_iter, input_data + n * Ti * OC + offset, + sizeof(T) * num_elements); + output_data_iter += num_elements; + } + } break; + // Reorder output values to PP format [T, N, C] -> [WORDS, C] + case platform::RNNReorderType::TNC_PP: { + auto* output_data_iter = output_data; + for (int n = 0; n < N; ++n) { + const auto num_elements = lod[n + 1] - lod[n]; + const auto offset = is_reverse ? 
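+              // Sentences shorter than Ti are padded in the oneDNN buffer;
+              // for a reversed sequence the valid steps sit at the end of the
+              // Ti window, so the copy below skips the leading padding.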
(Ti - num_elements) : 0; + for (size_t t = 0; t < num_elements; ++t) { + memcpy(output_data_iter, + input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC); + output_data_iter += OC; + } + } + } break; + } + } + + std::shared_ptr AcquireInputMemoryWithReorder( + const LoDTensor* input, const bool is_reverse) { + const auto name = this->key_ + "@input_mem"; + auto memory_p = + std::static_pointer_cast(this->dev_ctx_.GetBlob(name)); + + if (!memory_p) { + memory_p = std::make_shared(this->fwd_pd_->src_desc(), + this->engine_); + this->dev_ctx_.SetBlob(name, memory_p); + } + + const auto& input_lod = input->lod()[0]; + auto* x_data = input->data(); + + auto* x_onednn_data = reinterpret_cast(memory_p->get_data_handle()); + memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC); + + if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) == + dnnl::memory::format_tag::ntc) { + reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse, + platform::RNNReorderType::PP_NTC); + } else { + reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse, + platform::RNNReorderType::PP_TNC); + } + return memory_p; + } + + std::shared_ptr AcquireOutputMemory() { + const auto name = this->key_ + "@output_mem"; + auto memory_p = + std::static_pointer_cast(this->dev_ctx_.GetBlob(name)); + + if (!memory_p) { + memory_p = std::make_shared(this->fwd_pd_->dst_desc(), + this->engine_); + this->dev_ctx_.SetBlob(name, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireH0Memory(const Tensor* h0) { + const std::string h0_key = memory_key_ + "@h0"; + auto memory_p = + std::static_pointer_cast(this->dev_ctx_.GetBlob(h0_key)); + + auto* h0_data = to_void_cast(h0->data()); + + if (!memory_p) { + memory_p = std::make_shared( + this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data); + this->dev_ctx_.SetBlob(h0_key, memory_p); + } else { + memory_p->set_data_handle(h0_data); + } + return memory_p; + } + + std::shared_ptr AcquireWeightXMemory(const Tensor* weight_x, + const bool origin_mode) { + const std::string wx_key = memory_key_ + "@weight_x"; + auto memory_p = + std::static_pointer_cast(this->dev_ctx_.GetBlob(wx_key)); + + if (!memory_p) { + auto user_md = + MKLDNNMemDesc({1, 1, IC, 3, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldigo); + auto user_memory = dnnl::memory(user_md, this->engine_); + + auto* weight_x_data = + reinterpret_cast(user_memory.get_data_handle()); + memcpy(weight_x_data, weight_x->data(), + sizeof(float) * IC * 3 * OC); + + if (origin_mode == false) { + for (int64_t i = 0; i < IC; ++i) { + for (int64_t j = 0; j < OC; ++j) { + weight_x_data[j] *= -1; + } + weight_x_data += 3 * OC; + } + } + + memory_p = std::make_shared( + this->fwd_pd_->weights_layer_desc(), this->engine_); + + dnnl::stream astream(this->engine_); + dnnl::reorder(user_memory, *memory_p) + .execute(astream, user_memory, *memory_p); + + this->dev_ctx_.SetBlob(wx_key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireWeightHMemory(const Tensor* weight_h, + const bool origin_mode) { + const std::string wh_key = memory_key_ + "@weight_h"; + auto memory_p = + std::static_pointer_cast(this->dev_ctx_.GetBlob(wh_key)); + + if (!memory_p) { + auto user_md = + MKLDNNMemDesc({1, 1, OC, 3, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldigo); + auto user_memory = dnnl::memory(user_md, this->engine_); + + // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to + // oneDNN format [OC, 3OC] + auto* weight_h_data = + reinterpret_cast(user_memory.get_data_handle()); + auto* user_weight_h_data = 
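+      // WeightH in Paddle is stored as two blocks: update/reset gate weights
+      // of shape [OC, 2*OC] followed by candidate weights of shape [OC, OC],
+      // while oneDNN expects all three gates contiguous per row; the
+      // two-iterator copy below interleaves them.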
weight_h->data(); + + auto src1_iter = user_weight_h_data; + auto src2_iter = user_weight_h_data + 2 * OC * OC; + + for (int64_t c = 0; c < OC; ++c) { + memcpy(weight_h_data, src1_iter, 2 * OC * sizeof(float)); + memcpy(weight_h_data + 2 * OC, src2_iter, OC * sizeof(float)); + + src1_iter += 2 * OC; + src2_iter += OC; + weight_h_data += 3 * OC; + } + + weight_h_data = reinterpret_cast(user_memory.get_data_handle()); + + if (origin_mode == false) { + for (int64_t i = 0; i < OC; ++i) { + for (int64_t j = 0; j < OC; ++j) { + weight_h_data[j] *= -1; + } + weight_h_data += 3 * OC; + } + } + + memory_p = std::make_shared( + this->fwd_pd_->weights_iter_desc(), this->engine_); + + dnnl::stream astream(this->engine_); + dnnl::reorder(user_memory, *memory_p) + .execute(astream, user_memory, *memory_p); + + this->dev_ctx_.SetBlob(wh_key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireBiasMemory(const Tensor* bias, + const bool origin_mode) { + const std::string bias_key = memory_key_ + "@bias"; + auto memory_p = std::static_pointer_cast( + this->dev_ctx_.GetBlob(bias_key)); + + if (!memory_p) { + memory_p = std::make_shared(this->fwd_pd_->bias_desc(), + this->engine_); + auto* bias_data = reinterpret_cast(memory_p->get_data_handle()); + if (bias) { + const float* user_bias_data = + bias->data(); // Bias in oneDNN is always float + memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OC); + } else { + // oneDNN always need bias memory, if it's not provided in PP, let + // oneDNN allocate memory and set it to 0 + memset(bias_data, 0, sizeof(float) * 3 * OC); + } + + if (origin_mode == false && bias) { + for (int64_t i = 0; i < OC; ++i) { + bias_data[i] *= -1; + } + } + this->dev_ctx_.SetBlob(bias_key, memory_p); + } + return memory_p; + } + + private: + // RNN dimensions + // N - Batch Size + // Ti - Max sentence length + // IC - Input Channels + // OC - Output Channels + const int64_t N, Ti, IC, OC; + + // Memory size of weights, bias and h0 does not depend + // on Ti size, thus we need another key to cache them + std::string memory_key_; +}; + +template +class FusionGRUMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + // Get Tensors + const auto* input = ctx.Input("X"); + const auto* h0 = ctx.Input("H0"); + const auto* weight_x = ctx.Input("WeightX"); + const auto* weight_h = ctx.Input("WeightH"); + const auto* bias = ctx.Input("Bias"); + auto* hidden = ctx.Output("Hidden"); + + // Get attributes + const bool is_reverse = ctx.Attr("is_reverse"); + const bool origin_mode = ctx.Attr("origin_mode"); + + // Get tensor dimensions + const auto x_dims = framework::vectorize(input->dims()); + const auto weight_h_dims = framework::vectorize(weight_h->dims()); + const auto& input_lod = input->lod()[0]; + + // Calculate RNN dimensions + const int64_t N = input_lod.size() - 1; // Number of sentences (batches) + const int64_t Ti = // Max length of the sentence in a batch + [&input_lod]() { + size_t res = 0; + for (size_t i = 0; i < (input_lod.size() - 1); ++i) { + res = std::max(res, input_lod[i + 1] - input_lod[i]); + } + return res; + }(); + const int64_t IC = x_dims[1]; // Input channels + const int64_t OC = weight_h_dims[0]; // Output channels + + GRUMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), + input, weight_h, h0, is_reverse, N, Ti, IC, OC, + ctx.InputName("X") + 
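+    // Combining the X and WeightH argument names keeps the cached primitives
+    // of distinct fusion_gru instances apart; the handler additionally folds
+    // Ti into the key, since input/output memory shapes depend on the
+    // sequence length.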
ctx.InputName("WeightH")); + + auto input_memory_p = + handler.AcquireInputMemoryWithReorder(input, is_reverse); + auto weight_x_memory_p = + handler.AcquireWeightXMemory(weight_x, origin_mode); + auto weight_h_memory_p = + handler.AcquireWeightHMemory(weight_h, origin_mode); + auto bias_memory_p = handler.AcquireBiasMemory(bias, origin_mode); + auto hidden_onednn_memory_p = handler.AcquireOutputMemory(); + + std::unordered_map gru_args = { + {DNNL_ARG_SRC_LAYER, *input_memory_p}, + {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p}, + {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p}, + {DNNL_ARG_BIAS, *bias_memory_p}, + {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}}; + + if (h0) { + auto h0_memory_p = handler.AcquireH0Memory(h0); + gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p}); + } + + auto gru_forward_p = handler.AcquireForwardPrimitive(); + + dnnl::stream astream(mkldnn_engine); + gru_forward_p->execute(astream, gru_args); + astream.wait(); + + auto* hidden_onednn_data = + reinterpret_cast(hidden_onednn_memory_p->get_data_handle()); + auto* hidden_data = hidden->mutable_data(ctx.GetPlace()); + if (handler.is_NTC()) { + handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod, + is_reverse, platform::RNNReorderType::NTC_PP); + } else { + handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod, + is_reverse, platform::RNNReorderType::TNC_PP); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace, + ops::FusionGRUMKLDNNKernel); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index c147bdccbe99e505a8fd8f1ec75c487b00c02067..60588d89db803fe6421d16b48ec98e1f02ad522c 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -181,6 +181,8 @@ inline mkldnn::memory::format_tag GetMKLDNNFormat( if (inner_nblks == 0) { if (strides[0] >= strides[1] && strides[1] >= strides[2]) { return mkldnn::memory::format_tag::ncw; + } else if (strides[1] >= strides[0] && strides[0] >= strides[2]) { + return mkldnn::memory::format_tag::ntc; } else { return mkldnn::memory::format_tag::nwc; } @@ -420,5 +422,7 @@ inline std::vector> ToMkldnnPadding( } } +enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP }; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index dbc9eb065c4240a7d2dc135965f23ddc153bfd16..b5165078cb17fe404d7a12230f02283b41391a3f 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,7 +1,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context - gloo_wrapper infer_io_utils) + gloo_wrapper infer_io_utils heter_wrapper) if (WITH_NCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) @@ -31,6 +31,7 @@ set(PYBIND_SRCS global_value_getter_setter.cc reader_py.cc fleet_wrapper_py.cc + heter_wrapper_py.cc gloo_wrapper_py.cc box_helper_py.cc data_set_py.cc diff --git a/paddle/fluid/pybind/heter_wrapper_py.cc b/paddle/fluid/pybind/heter_wrapper_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..0603cd3faae1d7007ec31813373019172baf6eb7 --- /dev/null +++ b/paddle/fluid/pybind/heter_wrapper_py.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 
PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include +#include + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +#ifdef PADDLE_WITH_PSLIB +void BindHeterWrapper(py::module* m) { + py::class_>( + *m, "Heter") + .def(py::init([]() { return framework::HeterWrapper::GetInstance(); })) + .def("create_client2xpu_connection", + &framework::HeterWrapper::CreateClient2XpuConnection) + .def("set_xpu_list", &framework::HeterWrapper::SetXpuList) + .def("start_xpu_service", &framework::HeterWrapper::StartXpuService) + .def("end_pass", &framework::HeterWrapper::EndPass) + .def("stop_xpu_service", &framework::HeterWrapper::StopXpuService); +} // end HeterWrapper +#endif +} // end namespace pybind +} // end namespace paddle diff --git a/paddle/fluid/pybind/heter_wrapper_py.h b/paddle/fluid/pybind/heter_wrapper_py.h new file mode 100644 index 0000000000000000000000000000000000000000..047c1f359ea8397998b967afbe30f5c0ff8d6170 --- /dev/null +++ b/paddle/fluid/pybind/heter_wrapper_py.h @@ -0,0 +1,29 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +#ifdef PADDLE_WITH_PSLIB +void BindHeterWrapper(py::module* m); +#endif +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d58c36dd8f20e35fe4a564bd7e119c17f1296ba2..d6a8b226637c15b76d869a23daf7168a7ac51211 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -66,6 +66,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/fleet_wrapper_py.h" #include "paddle/fluid/pybind/global_value_getter_setter.h" #include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/ir.h" @@ -2479,6 +2480,9 @@ All parameter, weight, gradient are variables in Paddle. 
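For orientation, a heavily hedged sketch of how these bindings might be driven from Python: the method names come from the bindings above, but the construction path, endpoint format, and call order are assumptions, not taken from this patch.

```python
# Illustrative only: method names match the pybind bindings above; argument
# formats and call order are assumptions, not part of this patch.
from paddle.fluid import core

heter = core.Heter()                    # backed by HeterWrapper::GetInstance()
heter.set_xpu_list(["127.0.0.1:8500"])  # hypothetical endpoint list
heter.create_client2xpu_connection()
heter.start_xpu_service()
# ... run the heterogeneous training pass ...
heter.end_pass()
heter.stop_xpu_service()
```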
.def("device_count", &ParallelExecutor::DeviceCount); BindFleetWrapper(&m); +#ifdef PADDLE_WITH_PSLIB + BindHeterWrapper(&m); +#endif BindGlooWrapper(&m); BindBoxHelper(&m); #ifdef PADDLE_WITH_BOX_PS diff --git a/python/paddle/fleet/__init__.py b/python/paddle/fleet/__init__.py index b25c362ce9301c122d2e2b6915e444da6a90ceca..cc5ce0f2b74b6193652b7ec7ed4e03407df296c5 100644 --- a/python/paddle/fleet/__init__.py +++ b/python/paddle/fleet/__init__.py @@ -16,10 +16,13 @@ from .base.distributed_strategy import DistributedStrategy from .base.fleet_base import Fleet from .base.util_factory import UtilBase - +from .dataset import * #from .base.role_maker import PaddleCloudRoleMaker -__all__ = ["DistributedStrategy", "UtilBase"] +__all__ = [ + "DistributedStrategy", "UtilBase", "DatasetFactory", "DatasetBase", + "InMemoryDataset", "QueueDataset" +] fleet = Fleet() init = fleet.init diff --git a/python/paddle/fleet/dataset/__init__.py b/python/paddle/fleet/dataset/__init__.py index 8647330f3290f3142cabca9a7e3fe162a9838dda..af33c4eafb396827335157933d51f37ca8b06011 100644 --- a/python/paddle/fleet/dataset/__init__.py +++ b/python/paddle/fleet/dataset/__init__.py @@ -10,3 +10,5 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and + +from .dataset import * diff --git a/python/paddle/fleet/dataset/dataset.py b/python/paddle/fleet/dataset/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..f6504cacd9680806a13b4bb815247124b7e6a23c --- /dev/null +++ b/python/paddle/fleet/dataset/dataset.py @@ -0,0 +1,1103 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This is definition of dataset class, which is high performance IO.""" + +import paddle +import paddle.fluid as fluid +from paddle.fluid.proto import data_feed_pb2 +from google.protobuf import text_format +import paddle.fluid.core as core + + +class DatasetFactory(object): + """ + DatasetFactory is a factory which create dataset by its name, + you can create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset", + the default is "QueueDataset". + + Example: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + + """ + + def __init__(self): + """ Init. """ + pass + + def create_dataset(self, datafeed_class="QueueDataset"): + """ + Create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset", + the default is "QueueDataset". + + Args: + datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset. + Default is QueueDataset. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + + """ + try: + dataset = globals()[datafeed_class]() + return dataset + except: + raise ValueError("datafeed class %s does not exist" % + datafeed_class) + + +class DatasetBase(object): + """ Base dataset class. """ + + def __init__(self): + """ Init. """ + # define class name here + # to decide whether we need create in memory instance + self.proto_desc = data_feed_pb2.DataFeedDesc() + self.proto_desc.pipe_command = "cat" + self.dataset = core.Dataset("MultiSlotDataset") + self.thread_num = 1 + self.filelist = [] + + def set_pipe_command(self, pipe_command): + """ + Set pipe command of current dataset + A pipe command is a UNIX pipeline command that can be used only + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_pipe_command("python my_script.py") + + Args: + pipe_command(str): pipe command + + """ + self.proto_desc.pipe_command = pipe_command + + def set_rank_offset(self, rank_offset): + """ + Set rank_offset for merge_pv. It set the message of Pv. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_rank_offset("rank_offset") + + Args: + rank_offset(str): rank_offset's name + + """ + self.proto_desc.rank_offset = rank_offset + + def set_fea_eval(self, record_candidate_size, fea_eval=True): + """ + set fea eval mode for slots shuffle to debug the importance level of + slots(features), fea_eval need to be set True for slots shuffle. + + Args: + record_candidate_size(int): size of instances candidate to shuffle + one slot + fea_eval(bool): whether enable fea eval mode to enable slots shuffle. + default is True. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_fea_eval(1000000, True) + + """ + if fea_eval: + self.dataset.set_fea_eval(fea_eval, record_candidate_size) + self.fea_eval = fea_eval + + def slots_shuffle(self, slots): + """ + Slots Shuffle + Slots Shuffle is a shuffle method in slots level, which is usually used + in sparse feature with large scale of instances. To compare the metric, i.e. + auc while doing slots shuffle on one or several slots with baseline to + evaluate the importance level of slots(features). + + Args: + slots(list[string]): the set of slots(string) to do slots shuffle. + + Examples: + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_merge_by_lineid() + #suppose there is a slot 0 + dataset.slots_shuffle(['0']) + """ + if self.fea_eval: + slots_set = set(slots) + self.dataset.slots_shuffle(slots_set) + + def set_batch_size(self, batch_size): + """ + Set batch size. Will be effective during training + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_batch_size(128) + + Args: + batch_size(int): batch size + + """ + self.proto_desc.batch_size = batch_size + + def set_pv_batch_size(self, pv_batch_size): + """ + Set pv batch size. It will be effective during enable_pv_merge + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_pv_batch(128) + Args: + pv_batch_size(int): pv batch size + + """ + self.proto_desc.pv_batch_size = pv_batch_size + + def set_thread(self, thread_num): + """ + Set thread num, it is the num of readers. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_thread(12) + + Args: + thread_num(int): thread num + """ + self.dataset.set_thread_num(thread_num) + self.thread_num = thread_num + + def set_filelist(self, filelist): + """ + Set file list in current worker. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_filelist(['a.txt', 'b.txt']) + + Args: + filelist(list): file list + """ + self.dataset.set_filelist(filelist) + self.filelist = filelist + + def set_input_type(self, input_type): + self.proto_desc.input_type = input_type + + def set_use_var(self, var_list): + """ + Set Variables which you will use. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_use_var([data, label]) + + Args: + var_list(list): variable list + """ + multi_slot = self.proto_desc.multi_slot_desc + for var in var_list: + slot_var = multi_slot.slots.add() + slot_var.is_used = True + slot_var.name = var.name + if var.lod_level == 0: + slot_var.is_dense = True + slot_var.shape.extend(var.shape) + if var.dtype == core.VarDesc.VarType.FP32: + slot_var.type = "float" + elif var.dtype == core.VarDesc.VarType.INT64: + slot_var.type = "uint64" + else: + raise ValueError( + "Currently, fluid.dataset only supports dtype=float32 and dtype=int64" + ) + + def set_hdfs_config(self, fs_name, fs_ugi): + """ + Set hdfs config: fs name ad ugi + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") + + Args: + fs_name(str): fs name + fs_ugi(str): fs ugi + """ + self.dataset.set_hdfs_config(fs_name, fs_ugi) + + def set_download_cmd(self, download_cmd): + """ + Set customized download cmd: download_cmd + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_download_cmd("./read_from_afs") + + Args: + download_cmd(str): customized download command + """ + self.dataset.set_download_cmd(download_cmd) + + def _prepare_to_run(self): + """ + Set data_feed_desc before load or shuffle, + user no need to call this function. + """ + if self.thread_num > len(self.filelist): + self.thread_num = len(self.filelist) + self.dataset.set_thread_num(self.thread_num) + self.dataset.set_data_feed_desc(self.desc()) + self.dataset.create_readers() + + def _finish_to_run(self): + self.dataset.destroy_readers() + + def desc(self): + """ + Returns a protobuf message for this DataFeedDesc + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + print(dataset.desc()) + + Returns: + A string message + """ + return text_format.MessageToString(self.proto_desc) + + def _dynamic_adjust_before_train(self, thread_num): + pass + + def _dynamic_adjust_after_train(self): + pass + + +class InMemoryDataset(DatasetBase): + """ + InMemoryDataset, it will load data into memory + and shuffle data before training. 
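As a concrete end-to-end sketch of this class on a single machine (the slot variables and file names below are placeholders, not part of the patch):

```python
import paddle.fluid as fluid

# Placeholder feed variables; slots must match the data layout.
data = fluid.data(name="words", shape=[None, 1], lod_level=1, dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")

dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_use_var([data, label])
dataset.set_batch_size(32)
dataset.set_thread(4)
dataset.set_filelist(["a.txt", "b.txt"])
dataset.load_into_memory()   # read all files into memory channels
dataset.local_shuffle()      # shuffle in memory before training
```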
+ This class should be created by DatasetFactory + + Example: + dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset") + """ + + def __init__(self): + """ Init. """ + super(InMemoryDataset, self).__init__() + self.proto_desc.name = "MultiSlotInMemoryDataFeed" + self.fleet_send_batch_size = None + self.is_user_set_queue_num = False + self.queue_num = None + self.parse_ins_id = False + self.parse_content = False + self.parse_logkey = False + self.merge_by_sid = True + self.enable_pv_merge = False + self.merge_by_lineid = False + self.fleet_send_sleep_seconds = None + + def set_feed_type(self, data_feed_type): + """ + Set data_feed_desc + """ + self.proto_desc.name = data_feed_type + + def _prepare_to_run(self): + """ + Set data_feed_desc before load or shuffle, + user no need to call this function. + """ + if self.thread_num <= 0: + self.thread_num = 1 + self.dataset.set_thread_num(self.thread_num) + if self.queue_num is None: + self.queue_num = self.thread_num + self.dataset.set_queue_num(self.queue_num) + self.dataset.set_parse_ins_id(self.parse_ins_id) + self.dataset.set_parse_content(self.parse_content) + self.dataset.set_parse_logkey(self.parse_logkey) + self.dataset.set_merge_by_sid(self.merge_by_sid) + self.dataset.set_enable_pv_merge(self.enable_pv_merge) + self.dataset.set_data_feed_desc(self.desc()) + self.dataset.create_channel() + self.dataset.create_readers() + + def _dynamic_adjust_before_train(self, thread_num): + if not self.is_user_set_queue_num: + self.dataset.dynamic_adjust_channel_num(thread_num, False) + self.dataset.dynamic_adjust_readers_num(thread_num) + + def _dynamic_adjust_after_train(self): + if not self.is_user_set_queue_num: + self.dataset.dynamic_adjust_channel_num(self.thread_num, False) + self.dataset.dynamic_adjust_readers_num(self.thread_num) + + def set_queue_num(self, queue_num): + """ + Set Dataset output queue num, training threads get data from queues + + Args: + queue_num(int): dataset output queue num + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_queue_num(12) + + """ + self.is_user_set_queue_num = True + self.queue_num = queue_num + + def set_parse_ins_id(self, parse_ins_id): + """ + Set id Dataset need to parse insid + + Args: + parse_ins_id(bool): if parse ins_id or not + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_parse_ins_id(True) + + """ + self.parse_ins_id = parse_ins_id + + def set_parse_content(self, parse_content): + """ + Set if Dataset need to parse content + + Args: + parse_content(bool): if parse content or not + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_parse_content(True) + + """ + self.parse_content = parse_content + + def set_parse_logkey(self, parse_logkey): + """ + Set if Dataset need to parse logkey + + Args: + parse_content(bool): if parse logkey or not + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_parse_logkey(True) + + """ + self.parse_logkey = parse_logkey + + def set_merge_by_sid(self, merge_by_sid): + """ + Set if Dataset need to merge sid. If not, one ins means one Pv. + + Args: + merge_by_sid(bool): if merge sid or not + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_merge_by_sid(True) + + """ + self.merge_by_sid = merge_by_sid + + def set_enable_pv_merge(self, enable_pv_merge): + """ + Set if Dataset need to merge pv. + + Args: + enable_pv_merge(bool): if enable_pv_merge or not + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_enable_pv_merge(True) + + """ + self.enable_pv_merge = enable_pv_merge + + def preprocess_instance(self): + """ + Merge pv instance and convey it from input_channel to input_pv_channel. + It will be effective when enable_pv_merge_ is True. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.preprocess_instance() + + """ + self.dataset.preprocess_instance() + + def set_current_phase(self, current_phase): + """ + Set current phase in train. It is useful for untest. + current_phase : 1 for join, 0 for update. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.set_current_phase(1) + + """ + self.dataset.set_current_phase(current_phase) + + def postprocess_instance(self): + """ + Divide pv instance and convey it to input_channel. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.preprocess_instance() + exe.train_from_dataset(dataset) + dataset.postprocess_instance() + + """ + self.dataset.postprocess_instance() + + def set_fleet_send_batch_size(self, fleet_send_batch_size=1024): + """ + Set fleet send batch size, default is 1024 + + Args: + fleet_send_batch_size(int): fleet send batch size + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_fleet_send_batch_size(800) + + """ + self.fleet_send_batch_size = fleet_send_batch_size + + def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0): + """ + Set fleet send sleep time, default is 0 + + Args: + fleet_send_sleep_seconds(int): fleet send sleep time + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_fleet_send_sleep_seconds(2) + + """ + self.fleet_send_sleep_seconds = fleet_send_sleep_seconds + + def set_merge_by_lineid(self, merge_size=2): + """ + Set merge by line id, instances of same line id will be merged after + shuffle, you should parse line id in data generator. + + Args: + merge_size(int): ins size to merge. default is 2. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_merge_by_lineid() + + """ + self.dataset.set_merge_by_lineid(merge_size) + self.merge_by_lineid = True + self.parse_ins_id = True + + def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num): + self.dataset.set_generate_unique_feasigns(generate_uni_feasigns) + self.gen_uni_feasigns = generate_uni_feasigns + self.local_shard_num = shard_num + + def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, + consume_thread_num, shard_num): + self.dataset.generate_local_tables_unlock( + table_id, fea_dim, read_thread_num, consume_thread_num, shard_num) + + def load_into_memory(self): + """ + Load data into memory + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + """ + self._prepare_to_run() + self.dataset.load_into_memory() + + def preload_into_memory(self, thread_num=None): + """ + Load data into memory in async mode + + Args: + thread_num(int): preload thread num + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.preload_into_memory() + dataset.wait_preload_done() + """ + self._prepare_to_run() + if thread_num is None: + thread_num = self.thread_num + self.dataset.set_preload_thread_num(thread_num) + self.dataset.create_preload_readers() + self.dataset.preload_into_memory() + + def wait_preload_done(self): + """ + Wait preload_into_memory done + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.preload_into_memory() + dataset.wait_preload_done() + """ + self.dataset.wait_preload_done() + self.dataset.destroy_preload_readers() + + def local_shuffle(self): + """ + Local shuffle + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.local_shuffle() + """ + self.dataset.local_shuffle() + + def global_shuffle(self, fleet=None, thread_num=12): + """ + Global shuffle. + Global shuffle can be used only in distributed mode. i.e. multiple + processes on single machine or multiple machines training together. + If you run in distributed mode, you should pass fleet instead of None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.global_shuffle(fleet) + + Args: + fleet(Fleet): fleet singleton. Default None. + thread_num(int): shuffle thread num. Default is 12. 
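Condensing the docstring example, the intended distributed call pattern is the following sketch (it assumes a pslib fleet has been initialized on every trainer):

```python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_filelist(["a.txt", "b.txt"])
dataset.load_into_memory()
dataset.global_shuffle(fleet)  # barrier -> cross-trainer shuffle -> barrier
print(dataset.get_shuffle_data_size(fleet))  # global instance count
```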
+ + """ + trainer_num = 1 + if fleet is not None: + fleet._role_maker.barrier_worker() + trainer_num = fleet.worker_num() + if self.fleet_send_batch_size is None: + self.fleet_send_batch_size = 1024 + if self.fleet_send_sleep_seconds is None: + self.fleet_send_sleep_seconds = 0 + self.dataset.register_client2client_msg_handler() + self.dataset.set_trainer_num(trainer_num) + self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size) + self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds) + if fleet is not None: + fleet._role_maker.barrier_worker() + self.dataset.global_shuffle(thread_num) + if fleet is not None: + fleet._role_maker.barrier_worker() + if self.merge_by_lineid: + self.dataset.merge_by_lineid() + if fleet is not None: + fleet._role_maker.barrier_worker() + + def release_memory(self): + """ + :api_attr: Static Graph + + Release InMemoryDataset memory data, when data will not be used again. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.global_shuffle(fleet) + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + exe.train_from_dataset(fluid.default_main_program(), dataset) + dataset.release_memory() + + """ + self.dataset.release_memory() + + def get_pv_data_size(self): + """ + Get memory data size of Pv, user can call this function to know the pv num + of ins in all workers after load into memory. + + Note: + This function may cause bad performance, because it has barrier + + Returns: + The size of memory pv data. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + print dataset.get_pv_data_size() + + """ + return self.dataset.get_pv_data_size() + + def get_memory_data_size(self, fleet=None): + """ + Get memory data size, user can call this function to know the num + of ins in all workers after load into memory. + + Note: + This function may cause bad performance, because it has barrier + + Args: + fleet(Fleet): Fleet Object. + + Returns: + The size of memory data. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + print dataset.get_memory_data_size(fleet) + + """ + import numpy as np + local_data_size = self.dataset.get_memory_data_size() + local_data_size = np.array([local_data_size]) + if fleet is not None: + global_data_size = local_data_size * 0 + fleet._role_maker.all_reduce_worker(local_data_size, + global_data_size) + return global_data_size[0] + return local_data_size[0] + + def get_shuffle_data_size(self, fleet=None): + """ + Get shuffle data size, user can call this function to know the num + of ins in all workers after local/global shuffle. + + Note: + This function may cause bad performance to local shuffle, + because it has barrier. It does not affect global shuffle. + + Args: + fleet(Fleet): Fleet Object. + + Returns: + The size of shuffle data. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.global_shuffle(fleet) + print dataset.get_shuffle_data_size(fleet) + + """ + import numpy as np + local_data_size = self.dataset.get_shuffle_data_size() + local_data_size = np.array([local_data_size]) + if fleet is not None: + global_data_size = local_data_size * 0 + fleet._role_maker.all_reduce_worker(local_data_size, + global_data_size) + return global_data_size[0] + return local_data_size[0] + + +class QueueDataset(DatasetBase): + """ + QueueDataset, it will process data streamly. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + + """ + + def __init__(self): + """ + Initialize QueueDataset + This class should be created by DatasetFactory + """ + super(QueueDataset, self).__init__() + self.proto_desc.name = "MultiSlotDataFeed" + + def _prepare_to_run(self): + """ + Set data_feed_desc/thread num/filelist before run, + user no need to call this function. + """ + if self.thread_num > len(self.filelist): + self.thread_num = len(self.filelist) + if self.thread_num == 0: + self.thread_num = 1 + self.dataset.set_thread_num(self.thread_num) + self.dataset.set_filelist(self.filelist) + self.dataset.set_data_feed_desc(self.desc()) + self.dataset.create_readers() + + def local_shuffle(self): + """ + Local shuffle data. + + Local shuffle is not supported in QueueDataset + NotImplementedError will be raised + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + dataset.local_shuffle() + + Raises: + NotImplementedError: QueueDataset does not support local shuffle + + """ + raise NotImplementedError( + "QueueDataset does not support local shuffle, " + "please use InMemoryDataset for local_shuffle") + + def global_shuffle(self, fleet=None): + """ + Global shuffle data. + + Global shuffle is not supported in QueueDataset + NotImplementedError will be raised + + Args: + fleet(Fleet): fleet singleton. Default None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet + dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + dataset.global_shuffle(fleet) + + Raises: + NotImplementedError: QueueDataset does not support global shuffle + + """ + raise NotImplementedError( + "QueueDataset does not support global shuffle, " + "please use InMemoryDataset for global_shuffle") + + +class FileInstantDataset(DatasetBase): + """ + FileInstantDataset, it will process data streamly. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset") + """ + + def __init__(self): + """ + Initialize FileInstantDataset + This class should be created by DatasetFactory + """ + super(FileInstantDataset, self).__init__() + self.proto_desc.name = "MultiSlotFileInstantDataFeed" + + def local_shuffle(self): + """ + Local shuffle + FileInstantDataset does not support local shuffle + """ + raise NotImplementedError( + "FileInstantDataset does not support local shuffle, " + "please use InMemoryDataset for local_shuffle") + + def global_shuffle(self, fleet=None): + """ + Global shuffle + FileInstantDataset does not support global shuffle + """ + raise NotImplementedError( + "FileInstantDataset does not support global shuffle, " + "please use InMemoryDataset for global_shuffle") + + +class BoxPSDataset(InMemoryDataset): + """ + BoxPSDataset: derived from InMemoryDataset. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + """ + + def __init__(self): + """ + Initialize BoxPSDataset + This class should be created by DatasetFactory + """ + super(BoxPSDataset, self).__init__() + self.boxps = core.BoxPS(self.dataset) + self.proto_desc.name = "PaddleBoxDataFeed" + + def set_date(self, date): + """ + Set the pass date for BoxPS; `date` must be an 8-character + string in YYYYMMDD format. + """ + year = int(date[:4]) + month = int(date[4:6]) + day = int(date[6:]) + self.boxps.set_date(year, month, day) + + def begin_pass(self): + """ + Begin Pass + Notify BoxPS to load the sparse parameters of the next pass into GPU memory + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + dataset.begin_pass() + """ + self.boxps.begin_pass() + + def end_pass(self, need_save_delta): + """ + End Pass + Notify BoxPS that the current pass has ended + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + dataset.end_pass(True) + """ + self.boxps.end_pass(need_save_delta) + + def wait_preload_done(self): + """ + Wait until the async preload (feed pass) is done + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.preload_into_memory() + dataset.wait_preload_done() + """ + self.boxps.wait_feed_pass_done() + + def load_into_memory(self): + """ + Load the next pass into memory and notify BoxPS to fetch its embeddings from SSD + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + """ + self._prepare_to_run() + self.boxps.load_into_memory() + + def preload_into_memory(self): + """ + Begin async preloading of the next pass while the current pass may still be training + Examples: + ..
code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.preload_into_memory() + """ + self._prepare_to_run() + self.boxps.preload_into_memory() + + def _dynamic_adjust_before_train(self, thread_num): + if not self.is_user_set_queue_num: + self.dataset.dynamic_adjust_channel_num(thread_num, True) + self.dataset.dynamic_adjust_readers_num(thread_num) + + def _dynamic_adjust_after_train(self): + pass + + def slots_shuffle(self, slots): + """ + Slots Shuffle + Slots Shuffle is a slot-level shuffle method, usually used for sparse + features with a large number of instances. By comparing a metric + (e.g. AUC) computed with one or several slots shuffled against the + baseline, the importance of those slots (features) can be evaluated. + + Args: + slots(list[string]): the set of slots(string) to do slots shuffle. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + dataset.set_merge_by_lineid() + # suppose there is a slot 0 + dataset.slots_shuffle(['0']) + """ + slots_set = set(slots) + self.boxps.slots_shuffle(slots_set) diff --git a/python/paddle/fluid/annotations.py b/python/paddle/fluid/annotations.py deleted file mode 100644 index 15e7976354f2a22065f1723bfa696d056181dac2..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/annotations.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import functools -import sys - -__all__ = ['deprecated'] - - -def deprecated(since, instead, extra_message=""): - def decorator(func): - err_msg = "API {0} is deprecated since {1}. 
Please use {2} instead.".format( - func.__name__, since, instead) - if len(extra_message) != 0: - err_msg += "\n" - err_msg += extra_message - - @functools.wraps(func) - def wrapper(*args, **kwargs): - print(err_msg, file=sys.stderr) - return func(*args, **kwargs) - - wrapper.__doc__ += "\n " - wrapper.__doc__ += err_msg - return wrapper - - return decorator diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index 3831dee2964992f1cc035502cef12ac4967e0a72..4796cd5ada420567fa126154cc1ac28badc0f2c0 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -223,7 +223,8 @@ class DownpourSGD(DeviceWorker): dense_table_set.add(i) break - trainer_desc.device_worker_name = "DownpourWorker" + trainer_desc.device_worker_name = opt_info.get("worker_class", + "DownpourWorker") pull_thread = trainer_desc.pull_dense_param pull_thread.device_num = trainer_desc.thread_num if opt_info.get("program_id_to_worker") is None: diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 6272f7369ec6db0cf7b3e5d82f689ddabf3e19ab..79e812ff6192bc09e6e8c71397c6a239011dfae6 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -36,7 +36,6 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from paddle.fluid.dygraph.base import param_guard from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from -from paddle.fluid.annotations import deprecated __all__ = ['ProgramTranslator', 'convert_to_static'] diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index cc2b746b0c1e9a00c21ebe6762ba4da38d20c511..58efa58ac8b3ba80a7ddc293681f3183c44734ef 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -35,7 +35,7 @@ __all__ = [ 'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding', 'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm', - 'SpectralNorm', 'TreeConv' + 'SpectralNorm', 'TreeConv', 'Flatten' ] @@ -3182,3 +3182,62 @@ class TreeConv(layers.Layer): else: pre_activation = out return self._helper.append_activation(pre_activation, act=self._act) + + +class Flatten(layers.Layer): + """ + :alias_main: paddle.nn.Flatten + :alias: paddle.nn.Flatten,paddle.nn.layer.Flatten,paddle.nn.layer.common.Flatten + This interface is used to construct a callable object of the ``Flatten`` class. + For more details, refer to code examples. + It flattens a contiguous range of axes of the input tensor. + + Parameters: + start_axis(int): the first axis to flatten. Default: 1. + stop_axis(int): the last axis to flatten. Default: -1. + + Returns: + None + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.imperative import to_variable + import numpy as np + + inp_np = np.ones([5, 2, 3, 4]).astype('float32') + + paddle.enable_imperative() + + inp_np = to_variable(inp_np) + flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) + flatten_res = flatten(inp_np) + + """ + + def __init__(self, start_axis=1, stop_axis=-1): + super(Flatten, self).__init__() + self.start_axis = start_axis + self.stop_axis = stop_axis + + def forward(self, input): + out = self._helper.create_variable_for_type_inference(input.dtype) + x_shape = self._helper.create_variable_for_type_inference(input.dtype) + + if in_dygraph_mode(): + dy_out, _ = core.ops.flatten_contiguous_range( + input, 'start_axis', self.start_axis, 'stop_axis', + self.stop_axis) + return dy_out + self._helper.append_op( + type="flatten_contiguous_range", + inputs={"X": input}, + outputs={"Out": out, + "XShape": x_shape}, + attrs={"start_axis": self.start_axis, + "stop_axis": self.stop_axis}) + return out diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 9b22a016baa9cdc54cabb1d305518649c02b6546..a2e21cd55be4823e8cb56fa538e704d8c1c46fbd 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1300,6 +1300,12 @@ class Executor(object): fetch_list=None, fetch_info=None, print_period=100): + is_heter = 0 + if program._fleet_opt is not None: + if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker": + is_heter = 1 + if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer": + is_heter = 1 if scope is None: scope = global_scope() if fetch_list is None: @@ -1308,6 +1314,11 @@ fetch_info = [] assert len(fetch_list) == len(fetch_info) compiled = isinstance(program, compiler.CompiledProgram) + if is_heter: + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet + from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil + fu = FleetUtil() + ret = fu.split_program_by_device(program) if not compiled: # TODO: Need a better way to distinguish and specify different execution mode if program._pipeline_opt: @@ -1317,6 +1328,8 @@ trainer = TrainerFactory()._create_trainer(program._fleet_opt) trainer._set_thread_barrier(program._is_distributed) trainer._set_program(program) + if is_heter: + trainer._set_heter_info(ret) else: if program._pipeline_opt: trainer = TrainerFactory()._create_trainer( @@ -1476,6 +1489,60 @@ debug, fetch_list, fetch_info, print_period, fetch_handler) + def start_heter_trainer(self, + program=None, + scope=None, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None): + return self._start_heter_trainer(program, scope, False, debug, + fetch_list, fetch_info, print_period, + fetch_handler) + + def _start_heter_trainer(self, + program=None, + scope=None, + is_infer=False, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None): + + scope, trainer = self._prepare_trainer( + program=program, + dataset=None, + scope=scope, + thread=1, + debug=debug, + fetch_list=fetch_list, + fetch_info=fetch_info, + print_period=print_period) + + trainer._set_infer(is_infer) + trainer._gen_trainer_desc() + + self._dump_debug_info(program=program, trainer=trainer) + + trainer_instance = self._default_executor.init_for_dataset( + program.desc, trainer._desc(), scope, None) + + #if fetch_handler is not None: + # scope0 = trainer_instance.get_worker_scope(0) + # fetch_monitor 
= FetchHandlerMonitor(scope0, fetch_handler) + # fetch_monitor.start() + # self._default_executor.run_from_dataset(trainer_instance) + # fetch_monitor.stop() + # self._default_executor.release_trainer(trainer_instance) + #else: + + self._default_executor.run_from_dataset(trainer_instance) + #self._default_executor.release_trainer(trainer_instance) + + return trainer_instance + def train_from_dataset(self, program=None, dataset=None, diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py index 26085ec846512eefd3df962c88e56228daf34784..9be1fe92d1d0c77dd809a90ab00585ab5997f0a2 100644 --- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py +++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py @@ -149,6 +149,16 @@ class Fleet(object): """ return self._role_maker.is_server() + def is_xpu(self): + """ + Check whether the node is an instance of xpu, + i.e. a heterogeneous compute node. + + Returns: + bool: True if this is an xpu node, + False if not. + """ + return self._role_maker.is_xpu() + def split_files(self, files): """ split files before distributed training, diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index 8596bd05a8685f6c4feccdeecd295fd10abb09c9..7f8db694d3601be072ab30ffbbd345b25ffafd80 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -28,6 +28,7 @@ __all__ = [ class Role: WORKER = 1 SERVER = 2 + XPU = 3 class MockBarrier(object): @@ -988,6 +989,147 @@ class GeneralRoleMaker(RoleMakerBase): http_server.stop() +class HeterRoleMaker(GeneralRoleMaker): + """ + This role maker is for heterogeneous training (CPU trainers plus XPU + nodes); you can set os.environ to customize: + PADDLE_PSERVERS_IP_PORT_LIST : all pservers' ip:port, separated by ',' + PADDLE_TRAINER_ENDPOINTS : all trainers' ip:port, separated by ',' + PADDLE_XPU_ENDPOINTS : all xpu nodes' ip:port, separated by ',' + TRAINING_ROLE : TRAINER, PSERVER or XPU + PADDLE_TRAINER_ID : current trainer id (only for trainer), + it is index in PADDLE_TRAINER_ENDPOINTS + PADDLE_PSERVER_ID : current pserver id (only for pserver) + it is index in PADDLE_PSERVERS_IP_PORT_LIST + PADDLE_XPU_ID : current xpu id (only for xpu), + it is index in PADDLE_XPU_ENDPOINTS + """ + + def generate_role(self): + """ + generate role for heter role maker + """ + if not self._role_is_generated: + eplist = os.environ["PADDLE_PSERVERS_IP_PORT_LIST"].split(",") + training_role = os.environ["TRAINING_ROLE"] + worker_endpoints = os.environ["PADDLE_TRAINER_ENDPOINTS"].split(",") + trainers_num = len(worker_endpoints) + xpu_endpoints = os.environ["PADDLE_XPU_ENDPOINTS"].split(",") + xpu_num = len(xpu_endpoints) + if training_role not in ["TRAINER", "PSERVER", "XPU"]: + raise ValueError( + "TRAINING_ROLE must be PSERVER or TRAINER or XPU") + if training_role == "TRAINER": + role = Role.WORKER + current_id = int(os.environ["PADDLE_TRAINER_ID"]) + self._node_type = 1 + self._cur_endpoint = worker_endpoints[current_id] + gloo = fluid.core.Gloo() + gloo.init(current_id, + len(worker_endpoints), + self._hdfs_path.rstrip("/") + "/trainer", + self._hdfs_name, self._hdfs_ugi, self._iface, + self._prefix) + self._node_type_comm = gloo + elif training_role == "XPU": + role = Role.XPU + current_id = int(os.environ["PADDLE_XPU_ID"]) + self._node_type = 2 + self._cur_endpoint = xpu_endpoints[current_id] + gloo = fluid.core.Gloo() + gloo.init(current_id, + len(xpu_endpoints), + self._hdfs_path.rstrip("/") + "/xpu", self._hdfs_name, + self._hdfs_ugi, self._iface, self._prefix) + self._node_type_comm = gloo + elif training_role == "PSERVER": + role = Role.SERVER + if 
os.environ.get("PADDLE_PSERVER_ID") is not None: + current_id = int(os.environ["PADDLE_PSERVER_ID"]) + cur_endpoint = eplist[current_id] + else: + # this is for compatible with paddlecloud + cur_ip = os.environ["POD_IP"] + cur_port = os.environ["PADDLE_PORT"] + cur_endpoint = ":".join([cur_ip, cur_port]) + current_id = eplist.index(cur_endpoint) + self._node_type = 0 + self._cur_endpoint = cur_endpoint + gloo = fluid.core.Gloo() + gloo.init(current_id, + len(eplist), + self._hdfs_path.rstrip("/") + "/pserver", + self._hdfs_name, self._hdfs_ugi, self._iface, + self._prefix) + self._node_type_comm = gloo + + if training_role == "TRAINER" or training_role == "XPU": + gloo = fluid.core.Gloo() + heter_list = worker_endpoints + xpu_endpoints + gloo.init( + heter_list.index(self._cur_endpoint), + len(heter_list), + self._hdfs_path.rstrip("/") + "/heter", self._hdfs_name, + self._hdfs_ugi, self._iface, self._prefix) + self._heter_comm = gloo + + gloo = fluid.core.Gloo() + all_list = worker_endpoints + eplist + xpu_endpoints + gloo.init( + all_list.index(self._cur_endpoint), + len(all_list), + self._hdfs_path.rstrip("/") + "/all", self._hdfs_name, + self._hdfs_ugi, self._iface, self._prefix) + + self._all_comm = gloo + self._trainers_num = trainers_num + self._server_endpoints = eplist + self._role = role + self._current_id = current_id + self._rank = all_list.index(self._cur_endpoint) + self._size = len(all_list) + self._worker_endpoints = worker_endpoints + self._xpu_endpoints = xpu_endpoints + self._role_is_generated = True + + def is_xpu(self): + """ + whether current process is an xpu node + """ + if not self._role_is_generated: + self.generate_role() + return self._role == Role.XPU + + def is_first_xpu(self): + """ + whether current process is the xpu node of rank 0 + """ + if not self._role_is_generated: + self.generate_role() + return self._role == Role.XPU and self._current_id == 0 + + def _barrier_xpu(self): + """ + barrier all xpu nodes in current distributed job + """ + if not self._role_is_generated: + self.generate_role() + if self.is_xpu(): + self._node_type_comm.barrier() + + def _barrier_heter(self): + """ + barrier all heterogeneous nodes (workers and xpus) in current + distributed job + """ + if not self._role_is_generated: + self.generate_role() + if self.is_xpu() or self.is_worker(): + self._heter_comm.barrier() + + def xpu_num(self): + """ + get the number of xpu nodes in current distributed job + """ + if not self._role_is_generated: + self.generate_role() + return len(self._xpu_endpoints) + + class UserDefinedRoleMaker(RoleMakerBase): """ UserDefinedRoleMaker is designed for worker and server assignment diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 402250455f79dee24bc87ea7fb9136ae24a68e23..2a1945532e654605d2e2d45206daa3cd8306737f 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -23,6 +23,7 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Fleet from paddle.fluid.incubate.fleet.base.mode import Mode from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker +from paddle.fluid.incubate.fleet.base.role_maker import HeterRoleMaker class PSLib(Fleet): @@ -44,6 +45,9 @@ role_maker = MPISymetricRoleMaker() super(PSLib, self).init(role_maker) self._fleet_ptr = fluid.core.Fleet() + self._heter_ptr = None + if isinstance(role_maker, 
HeterRoleMaker): + self._heter_ptr = fluid.core.Heter() def _set_client_communication_config(self, request_timeout_ms, connect_timeout_ms, max_retry): @@ -77,23 +81,35 @@ raise Exception( "You should run DistributedOptimizer.minimize() first") # barrier_all for init_server, wait for server starts + if isinstance(self._role_maker, HeterRoleMaker): + if self._role_maker.is_xpu(): + local_endpoint = self._role_maker.get_local_endpoint() + local_endpoint = local_endpoint.split(":") + self._heter_ptr.start_xpu_service( + str(local_endpoint[0]), int(local_endpoint[1])) self._role_maker._barrier_all() self.all_ips_ = self._role_maker._all_gather(self._local_ip) # worker_index * 2 is for compatible with older versions of pslib self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_, self._role_maker._get_size(), self._role_maker.worker_index() * 2) + if isinstance(self._role_maker, HeterRoleMaker): + if self._role_maker.is_worker(): + self._heter_ptr.set_xpu_list( + self._role_maker._xpu_endpoints) + self._heter_ptr.create_client2xpu_connection() # barrier_all for init_worker self._role_maker._barrier_all() # prepare for client to client communication - info = self._fleet_ptr.get_clients_info() - all_info = self._role_maker._worker_gather(info[0]) - self._fleet_ptr.gather_clients(all_info) - self._fleet_ptr.set_client2client_config( - self._client2client_request_timeout_ms, - self._client2client_connect_timeout_ms, - self._client2client_max_retry) - self._fleet_ptr.create_client2client_connection() + if self._role_maker.is_worker(): + info = self._fleet_ptr.get_clients_info() + all_info = self._role_maker._worker_gather(info[0]) + self._fleet_ptr.gather_clients(all_info) + self._fleet_ptr.set_client2client_config( + self._client2client_request_timeout_ms, + self._client2client_connect_timeout_ms, + self._client2client_max_retry) + self._fleet_ptr.create_client2client_connection() # barrier for init model self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): @@ -144,10 +160,16 @@ >>> fleet.init_server("/you/path/to/model", mode = 0) """ mode = kwargs.get("mode", 0) - self._role_maker._barrier_worker() - if self._role_maker.is_first_worker(): - self._fleet_ptr.load_model(model_dir, mode) - self._role_maker._barrier_worker() + if isinstance(self._role_maker, HeterRoleMaker): + self._role_maker._barrier_xpu() + if self._role_maker.is_first_xpu(): + self._fleet_ptr.load_model(model_dir, mode) + self._role_maker._barrier_xpu() + else: + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.load_model(model_dir, mode) + self._role_maker._barrier_worker() def run_server(self): """ @@ -185,6 +207,54 @@ raise Exception( "You should run DistributedOptimizer.minimize() first") + def end_pass(self, scope): + if self._role_maker.worker_index() < self._role_maker.xpu_num(): + self._heter_ptr.end_pass(scope, self._role_maker.worker_index()) + self._heter_ptr.stop_xpu_service(self._role_maker.worker_index()) + + def train_from_dataset(self, + executor, + program=None, + dataset=None, + scope=None, + thread=0, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None): + """ + Train from a dataset via the wrapped executor; CPU workers first + wait at the heter barrier so that xpu nodes are ready. + """ + + if self._role_maker.is_worker(): + self._role_maker._barrier_heter() + executor.train_from_dataset(program, dataset, scope, thread, debug, + fetch_list, fetch_info, print_period, + fetch_handler) + + def start_heter_trainer(self, + executor, + program=None, + scope=None, + 
debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None): + """ + Start a heter trainer for the given program; xpu nodes synchronize + at the heter barrier before the trainer instance is released. + """ + + trainer_instance = executor.start_heter_trainer( + program, scope, debug, fetch_list, fetch_info, print_period, + fetch_handler) + if self._role_maker.is_xpu(): + self._role_maker._barrier_heter() + executor._default_executor.release_trainer(trainer_instance) + def stop_worker(self): """ stop(): will be called after a user finishes his/her training task. Fleet instance will be @@ -197,6 +267,7 @@ self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): self._fleet_ptr.stop_server() + if self._heter_ptr: self._heter_ptr.stop_xpu_service() self._role_maker._barrier_worker() self._role_maker._barrier_all() self._role_maker._finalize() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 232d3e0422e5542e1fd13efd80486ff9bb3d4a22..5cd1aa884a928db4980933091d951010ce347444 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -509,13 +509,15 @@ class DistributedAdam(DistributedOptimizerImplBase): opt_info = {} opt_info["program_id_to_worker"] = prog_id_to_worker opt_info["program_configs"] = program_configs - opt_info["trainer"] = "DistMultiTrainer" + opt_info["trainer"] = strategy.get("trainer", "DistMultiTrainer") opt_info["device_worker"] = strategy.get("device_worker", "DownpourSGD") opt_info["optimizer"] = "DownpourSGD" opt_info["fleet_desc"] = ps_param opt_info["worker_skipped_ops"] = worker_skipped_ops opt_info["use_cvm"] = strategy.get("use_cvm", False) opt_info["no_cvm"] = strategy.get("no_cvm", False) + opt_info["worker_class"] = strategy.get("worker_class", + "DownpourWorker") opt_info["stat_var_names"] = strategy.get("stat_var_names", []) opt_info["local_tables"] = strategy.get("local_tables", []) opt_info["async_tables"] = strategy.get("async_tables", []) @@ -529,6 +531,7 @@ opt_info["dump_file_num"] = strategy.get("dump_file_num", 16) opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "") opt_info["dump_param"] = strategy.get("dump_param", []) + opt_info["worker_places"] = strategy.get("worker_places", []) if server._server.downpour_server_param.downpour_table_param[ 0].accessor.accessor_class in [ "DownpourCtrAccessor", "DownpourCtrDoubleAccessor", diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index 3ae61891514ccaa96cc8e7429d1a988a4618173a..cb1a54ef19899059d1a46d0807ce58bf3b5ab8b5 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -14,6 +14,7 @@ """Fleet Utils.""" import collections +import copy import json import logging import math @@ -1615,3 +1616,123 @@ class FleetUtil(object): """ program = utils.load_program(prog_path, is_text) utils.parse_program(program, output_dir) + + def split_program_by_device(self, program): + """ + Split the ops of ``program`` into consecutive cpu/heter segments + according to their ``op_device`` attribute, and return + [start_idx, end_idx, send_list, recv_list, sub_program] for the + single heter segment (or None if the program has no heter ops). + """ + ops_list = [] + type_list = [] + pre = None + type_cpu = "cpu" + for op in program.global_block().ops: + if op.has_attr("op_device"): + if pre is None or pre != op.attr("op_device"): + ops_list.append([]) + type_list.append( + op.attr("op_device") + if op.attr("op_device") != "" else type_cpu) + ops_list[-1].append(op)
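+                # remember this op's device so that consecutive ops sharing a
+                # device end up in the same ops_list/type_list segment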
+ pre = op.attr("op_device") + l = len(type_list) + i = 0 + type_heter = None + while i < l: + while i < l and type_list[i] == type_cpu: + i += 1 + if i == l: + break + + type_heter = type_list[i] + i += 1 + start = i + valid = True + while i < l and type_list[i] != type_heter: + if type_list[i] != type_cpu: + valid = False + break + i += 1 + + if i == l: + break + elif not valid: + continue + + for j in range(start, i): + for op in ops_list[j]: + op._set_attr("op_device", type_heter) + type_list[j] = type_heter + j += 1 + + pre = None + merged_ops_list = [] + merged_type_list = [] + for i in range(l): + if pre is None or pre != type_list[i]: + merged_ops_list.append([]) + merged_type_list.append(type_list[i]) + merged_ops_list[-1].extend(ops_list[i]) + pre = type_list[i] + + data_vars = set() + for k in program.global_block().vars: + var = program.global_block().var(k) + if not var.persistable: + data_vars.add(var.name) + + l = len(merged_ops_list) + inputs_pre = set() + outputs_pre = set() + in_from_pre = [[] for i in range(l)] + for i in range(l): + inputs = set() + outputs = set() + for op in merged_ops_list[i]: + for input in op.input_names: + for tmp in op.input(input): + if tmp not in outputs: + inputs.add(tmp) + for output in op.output_names: + for tmp in op.output(output): + outputs.add(tmp) + if i == 0: + in_from_pre[i] = [] + elif i == 1: + in_from_pre[i] = (outputs_pre | data_vars) & inputs + else: + in_from_pre[i] = outputs_pre & inputs + inputs_pre = copy.deepcopy(inputs) + outputs_pre = copy.deepcopy(outputs) + + l = len(in_from_pre) + start_list = [] + end_list = [] + send_list = [[] for i in range(l)] + sum = 0 + program_list = [] + for i in range(l): + start_list.append(sum) + end_list.append(sum + len(merged_ops_list[i]) - 1) + sum += len(merged_ops_list[i]) + if i < l - 1: + send_list[i].extend(list(in_from_pre[i + 1])) + prog = program.clone() + if merged_type_list[i] != type_cpu: + prog = prog._prune_with_input( + list(in_from_pre[i]), list(send_list[i])) + program_list.append(prog) + else: + program_list.append(prog) + recv_list = [list(i) for i in in_from_pre] + found = False + heter_index = None + for i in range(len(merged_type_list)): + t = merged_type_list[i] + if t != type_cpu: + if found: + print("only one region of program can be heter") + found = True + heter_index = i + if heter_index is None: + print("warning: non heter program") + return None + else: + return [start_list[heter_index], end_list[heter_index], send_list[heter_index], \ + recv_list[heter_index], program_list[heter_index]] diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 7906f563c0009ac37695f50c9dc2b035b8f004aa..ea6abe2d335e6669b27ba278c0faaca62ca0fdbb 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -471,9 +471,9 @@ def rpn_target_assign(bbox_pred, def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): """ - :alias_main: paddle.nn.functional.sigmoid_focal_loss - :alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss - :old_api: paddle.fluid.layers.sigmoid_focal_loss + :alias_main: paddle.nn.functional.sigmoid_focal_loss + :alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss + :old_api: paddle.fluid.layers.sigmoid_focal_loss **Sigmoid Focal Loss Operator.** @@ -628,9 +628,9 @@ def detection_output(loc, nms_eta=1.0, return_index=False): """ - :alias_main: paddle.nn.functional.detection_output - :alias: 
paddle.nn.functional.detection_output,paddle.nn.functional.vision.detection_output - :old_api: paddle.fluid.layers.detection_output + :alias_main: paddle.nn.functional.detection_output + :alias: paddle.nn.functional.detection_output,paddle.nn.functional.vision.detection_output + :old_api: paddle.fluid.layers.detection_output Given the regression locations, classification confidences and prior boxes, calculate the detection outputs by performing following steps: @@ -761,9 +761,9 @@ def detection_output(loc, @templatedoc() def iou_similarity(x, y, box_normalized=True, name=None): """ - :alias_main: paddle.nn.functional.iou_similarity - :alias: paddle.nn.functional.iou_similarity,paddle.nn.functional.loss.iou_similarity - :old_api: paddle.fluid.layers.iou_similarity + :alias_main: paddle.nn.functional.iou_similarity + :alias: paddle.nn.functional.iou_similarity,paddle.nn.functional.loss.iou_similarity + :old_api: paddle.fluid.layers.iou_similarity ${comment} @@ -821,9 +821,9 @@ def box_coder(prior_box, name=None, axis=0): """ - :alias_main: paddle.nn.functional.box_coder - :alias: paddle.nn.functional.box_coder,paddle.nn.functional.vision.box_coder - :old_api: paddle.fluid.layers.box_coder + :alias_main: paddle.nn.functional.box_coder + :alias: paddle.nn.functional.box_coder,paddle.nn.functional.vision.box_coder + :old_api: paddle.fluid.layers.box_coder **Box Coder Layer** @@ -1012,9 +1012,9 @@ def yolov3_loss(x, name=None, scale_x_y=1.): """ - :alias_main: paddle.nn.functional.yolov3_loss - :alias: paddle.nn.functional.yolov3_loss,paddle.nn.functional.vision.yolov3_loss - :old_api: paddle.fluid.layers.yolov3_loss + :alias_main: paddle.nn.functional.yolov3_loss + :alias: paddle.nn.functional.yolov3_loss,paddle.nn.functional.vision.yolov3_loss + :old_api: paddle.fluid.layers.yolov3_loss ${comment} @@ -1139,9 +1139,9 @@ def yolo_box(x, name=None, scale_x_y=1.): """ - :alias_main: paddle.nn.functional.yolo_box - :alias: paddle.nn.functional.yolo_box,paddle.nn.functional.vision.yolo_box - :old_api: paddle.fluid.layers.yolo_box + :alias_main: paddle.nn.functional.yolo_box + :alias: paddle.nn.functional.yolo_box,paddle.nn.functional.vision.yolo_box + :old_api: paddle.fluid.layers.yolo_box ${comment} @@ -1318,9 +1318,9 @@ def bipartite_match(dist_matrix, dist_threshold=None, name=None): """ - :alias_main: paddle.nn.functional.bipartite_match - :alias: paddle.nn.functional.bipartite_match,paddle.nn.functional.vision.bipartite_match - :old_api: paddle.fluid.layers.bipartite_match + :alias_main: paddle.nn.functional.bipartite_match + :alias: paddle.nn.functional.bipartite_match,paddle.nn.functional.vision.bipartite_match + :old_api: paddle.fluid.layers.bipartite_match This operator implements a greedy bipartite matching algorithm, which is used to obtain the matching with the maximum distance based on the input @@ -1412,9 +1412,9 @@ def target_assign(input, mismatch_value=None, name=None): """ - :alias_main: paddle.nn.functional.target_assign - :alias: paddle.nn.functional.target_assign,paddle.nn.functional.extension.target_assign - :old_api: paddle.fluid.layers.target_assign + :alias_main: paddle.nn.functional.target_assign + :alias: paddle.nn.functional.target_assign,paddle.nn.functional.extension.target_assign + :old_api: paddle.fluid.layers.target_assign This operator can be, for given the target bounding boxes or labels, to assign classification and regression targets to each prediction as well as @@ -1530,9 +1530,9 @@ def ssd_loss(location, normalize=True, sample_size=None): """ - :alias_main: 
paddle.nn.functional.ssd_loss - :alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss - :old_api: paddle.fluid.layers.ssd_loss + :alias_main: paddle.nn.functional.ssd_loss + :alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss + :old_api: paddle.fluid.layers.ssd_loss **Multi-box loss layer for object detection algorithm of SSD** @@ -1777,9 +1777,9 @@ def prior_box(input, name=None, min_max_aspect_ratios_order=False): """ - :alias_main: paddle.nn.functional.prior_box - :alias: paddle.nn.functional.prior_box,paddle.nn.functional.vision.prior_box - :old_api: paddle.fluid.layers.prior_box + :alias_main: paddle.nn.functional.prior_box + :alias: paddle.nn.functional.prior_box,paddle.nn.functional.vision.prior_box + :old_api: paddle.fluid.layers.prior_box This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Each position of the input produce N prior boxes, N is determined by @@ -1938,9 +1938,9 @@ def density_prior_box(input, flatten_to_2d=False, name=None): """ - :alias_main: paddle.nn.functional.density_prior_box - :alias: paddle.nn.functional.density_prior_box,paddle.nn.functional.vision.density_prior_box - :old_api: paddle.fluid.layers.density_prior_box + :alias_main: paddle.nn.functional.density_prior_box + :alias: paddle.nn.functional.density_prior_box,paddle.nn.functional.vision.density_prior_box + :old_api: paddle.fluid.layers.density_prior_box This op generates density prior boxes for SSD(Single Shot MultiBox Detector) @@ -2130,7 +2130,7 @@ def multi_box_head(inputs, name=None, min_max_aspect_ratios_order=False): """ - :api_attr: Static Graph + :api_attr: Static Graph Base on SSD ((Single Shot MultiBox Detector) algorithm, generate prior boxes, regression location and classification confidence on multiple input feature @@ -2407,9 +2407,9 @@ def anchor_generator(input, offset=0.5, name=None): """ - :alias_main: paddle.nn.functional.anchor_generator - :alias: paddle.nn.functional.anchor_generator,paddle.nn.functional.vision.anchor_generator - :old_api: paddle.fluid.layers.anchor_generator + :alias_main: paddle.nn.functional.anchor_generator + :alias: paddle.nn.functional.anchor_generator,paddle.nn.functional.vision.anchor_generator + :old_api: paddle.fluid.layers.anchor_generator **Anchor generator operator** @@ -2612,9 +2612,9 @@ def generate_proposal_labels(rpn_rois, is_cls_agnostic=False, is_cascade_rcnn=False): """ - :alias_main: paddle.nn.functional.generate_proposal_labels - :alias: paddle.nn.functional.generate_proposal_labels,paddle.nn.functional.vision.generate_proposal_labels - :old_api: paddle.fluid.layers.generate_proposal_labels + :alias_main: paddle.nn.functional.generate_proposal_labels + :alias: paddle.nn.functional.generate_proposal_labels,paddle.nn.functional.vision.generate_proposal_labels + :old_api: paddle.fluid.layers.generate_proposal_labels **Generate Proposal Labels of Faster-RCNN** @@ -2737,9 +2737,9 @@ def generate_proposal_labels(rpn_rois, def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, labels_int32, num_classes, resolution): """ - :alias_main: paddle.nn.functional.generate_mask_labels - :alias: paddle.nn.functional.generate_mask_labels,paddle.nn.functional.vision.generate_mask_labels - :old_api: paddle.fluid.layers.generate_mask_labels + :alias_main: paddle.nn.functional.generate_mask_labels + :alias: paddle.nn.functional.generate_mask_labels,paddle.nn.functional.vision.generate_mask_labels + :old_api: paddle.fluid.layers.generate_mask_labels **Generate Mask Labels for 
Mask-RCNN** @@ -2896,9 +2896,9 @@ def generate_proposals(scores, name=None, return_rois_num=False): """ - :alias_main: paddle.nn.functional.generate_proposals - :alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals - :old_api: paddle.fluid.layers.generate_proposals + :alias_main: paddle.nn.functional.generate_proposals + :alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals + :old_api: paddle.fluid.layers.generate_proposals **Generate proposal Faster-RCNN** diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py index 78226a5201707a192b6fa38e11bfc243f5815a55..42ccdbb8d267a5d8a8198623a1818379885243ee 100644 --- a/python/paddle/fluid/layers/device.py +++ b/python/paddle/fluid/layers/device.py @@ -20,12 +20,12 @@ from __future__ import print_function from .layer_function_generator import autodoc from ..framework import unique_name from ..layer_helper import LayerHelper -from ..annotations import deprecated +from paddle.utils import deprecated __all__ = [] -@deprecated(since='0.15.0', instead="ParallelExecutor") +@deprecated(since='0.15.0', update_to="paddle.fluid.ParallelExecutor") @autodoc() def get_places(device_count=None, device_type=None): helper = LayerHelper('get_places', **locals()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 46fb61745aeb4748eed409d75fe82a11d78c0837..1b8df4a098ff1c11270854976d82693932627477 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -37,6 +37,7 @@ from functools import reduce from .. import core from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype import paddle +from paddle.utils import deprecated __all__ = [ 'fc', @@ -9910,7 +9911,7 @@ def flatten(x, axis=1, name=None): return out -def stack(x, axis=0): +def stack(x, axis=0, name=None): """ This OP stacks all the inputs :code:`x` along axis. @@ -9990,15 +9991,16 @@ def stack(x, axis=0): data = layers.stack(x1) # stack according to axis 0, data.shape=[1, None, 1, 2] """ - - helper = LayerHelper('stack', **locals()) axis = 0 if axis is None else axis - if not isinstance(x, list) and not isinstance(x, tuple): x = [x] + + if in_dygraph_mode(): + return core.ops.stack(x, 'axis', axis) + + helper = LayerHelper('stack', **locals()) out = helper.create_variable_for_type_inference(x[0].dtype) - if not in_dygraph_mode() and \ - x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \ "number of the elements must be 1, but received %s." 
% len(x) out_index = helper.create_variable_for_type_inference(dtype="int32") @@ -11614,6 +11616,7 @@ Examples: return _elementwise_op(LayerHelper('elementwise_sub', **locals())) +@deprecated(since="2.0.0", update_to="paddle.multiply") def elementwise_mul(x, y, axis=-1, act=None, name=None): """ :alias_main: paddle.elementwise_mul diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 0289ecea34acf65d01aa13b555ee523f7127b48d..1f96bbc4ceeac142d802ef94a18f5ace96ce820d 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -28,7 +28,6 @@ from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_bu from .unique_name import UniqueNameGenerator import logging import warnings -from .dataset import DatasetBase, InMemoryDataset ### Dygraph DataLoader configs ### import os @@ -1670,7 +1669,7 @@ class PyReader(DataLoaderBase): class DatasetLoader(DataLoaderBase): def __init__(self, dataset, places, drop_last): - assert isinstance(dataset, + assert isinstance(dataset, paddle.fleet.dataset. DatasetBase), "dataset must be type of DatasetBase" assert not in_dygraph_mode( ), "DatasetLoader is not supported in dygraph mode yet" @@ -1686,7 +1685,7 @@ class DatasetLoader(DataLoaderBase): dataset.set_thread(thread_num) - if isinstance(dataset, + if isinstance(dataset, paddle.fleet.dataset. InMemoryDataset) and dataset.queue_num > thread_num: logging.warn("queue_num {} which is set in Dataset is ignored". format(dataset.queue_num)) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 6bf95b9d6715bfade20069eec130a676d7edeb55..56ca3105dea792ed964d62057c07e2da7a4140fa 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -210,7 +210,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): filelist.append(train_file_path) # config dataset - dataset = fluid.DatasetFactory().create_dataset() + dataset = paddle.fleet.DatasetFactory().create_dataset() dataset.set_batch_size(batch_size) dataset.set_use_var(self.feeds) pipe_command = 'python ctr_dataset_reader.py' diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbbf7de22087d13aed1f8293d362aead5ae03b3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
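+# The subclasses below are assumed to reuse the numpy reference implementation
+# from test_fusion_gru_op: each set_confs only flips use_mkldnn (plus one shape
+# or flag), so the oneDNN kernel is checked against the same expected outputs.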
+ +import unittest +import numpy as np +from paddle.fluid.tests.unittests.test_fusion_gru_op import TestFusionGRUOp + + +class TestFusionGRUMKLDNNOp(TestFusionGRUOp): + def set_confs(self): + self.use_mkldnn = True + + +class TestFusionGRUMKLDNNOpNoInitial(TestFusionGRUOp): + def set_confs(self): + self.with_h0 = False + self.use_mkldnn = True + + +class TestFusionGRUMKLDNNOpNoBias(TestFusionGRUOp): + def set_confs(self): + self.with_bias = False + self.use_mkldnn = True + + +class TestFusionGRUMKLDNNOpReverse(TestFusionGRUOp): + def set_confs(self): + self.is_reverse = True + self.use_mkldnn = True + + +class TestFusionGRUMKLDNNOpOriginMode(TestFusionGRUOp): + def set_confs(self): + self.origin_mode = True + self.use_mkldnn = True + + +class TestFusionGRUMKLDNNOpMD1(TestFusionGRUOp): + def set_confs(self): + self.M = 36 + self.D = 8 + self.use_mkldnn = True + + +class TestFusionGRUMKLDNNOpMD2(TestFusionGRUOp): + def set_confs(self): + self.M = 8 + self.D = 8 + self.use_mkldnn = True + + +class TestFusionGRUMKLDNNOpMD3(TestFusionGRUOp): + def set_confs(self): + self.M = 17 + self.D = 15 + self.use_mkldnn = True + + +class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp): + def set_confs(self): + self.lod = [[3]] + self.D = 16 + self.use_mkldnn = True + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index cc2cee602918d53dd5435d9f498a9e8c9c948c58..90d5f58539500b87e999bec4c475ec40d0f55483 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -17,6 +17,7 @@ including create, config, run, etc. """ from __future__ import print_function +import paddle import paddle.fluid as fluid import paddle.compat as cpt import paddle.fluid.core as core @@ -37,23 +38,26 @@ class TestDataset(unittest.TestCase): def test_dataset_create(self): """ Testcase for dataset create. 
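+        Every registered factory name should construct a dataset; the unknown
+        name "MyOwnDataset" below is expected to raise.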
""" try: - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") except: self.assertTrue(False) try: - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "QueueDataset") except: self.assertTrue(False) try: - dataset = fluid.DatasetFactory().create_dataset( + dataset = paddle.fleet.DatasetFactory().create_dataset( "FileInstantDataset") except: self.assertTrue(False) try: - dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "MyOwnDataset") self.assertTrue(False) except: self.assertTrue(True) @@ -91,7 +95,8 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_batch_size(32) dataset.set_thread(3) dataset.set_filelist( @@ -125,7 +130,7 @@ class TestDataset(unittest.TestCase): dataset.set_trainer_num(4) dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") dataset.set_download_cmd("./read_from_afs my_fs_name my_fs_ugi") - dataset.enable_pv_merge() + dataset.set_enable_pv_merge(False) thread_num = dataset.get_thread_num() self.assertEqual(thread_num, 12) @@ -171,7 +176,8 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_batch_size(32) dataset.set_thread(3) dataset.set_filelist([filename1, filename2]) @@ -222,7 +228,8 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_batch_size(32) dataset.set_thread(3) dataset.set_filelist([ @@ -293,7 +300,8 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="float32", lod_level=1) slots_vars.append(var) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_batch_size(32) dataset.set_thread(1) dataset.set_parse_ins_id(True) @@ -359,7 +367,8 @@ class TestDataset(unittest.TestCase): name="slot4", shape=[1], dtype="float32", lod_level=0) slots_vars = [var1, var2, var3, var4] - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_batch_size(32) dataset.set_thread(1) dataset.set_parse_ins_id(True) @@ -414,7 +423,8 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="float32", lod_level=1) slots_vars.append(var) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_batch_size(32) dataset.set_thread(3) dataset.set_filelist([ @@ -507,7 +517,7 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset") dataset.set_batch_size(32) dataset.set_thread(3) 
dataset.set_filelist( @@ -532,7 +542,7 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset2 = fluid.DatasetFactory().create_dataset("QueueDataset") + dataset2 = paddle.fleet.DatasetFactory().create_dataset("QueueDataset") dataset2.set_use_var(slots_vars) dataset2.set_batch_size(32) dataset2.set_thread(3) @@ -573,7 +583,7 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="float32", lod_level=1) slots_vars.append(var) - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset") dataset.set_batch_size(32) dataset.set_thread(3) dataset.set_filelist( @@ -628,7 +638,8 @@ class TestDataset(unittest.TestCase): name=slot, shape=[None, 1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_input_type(1) dataset.set_batch_size(1) dataset.set_thread(2) @@ -707,7 +718,7 @@ class TestDatasetWithFetchHandler(unittest.TestCase): inputs(list): inputs of get_dataset files(list): files of get_dataset """ - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset") dataset.set_batch_size(32) dataset.set_thread(3) dataset.set_filelist(files) @@ -864,7 +875,8 @@ class TestDataset2(unittest.TestCase): except ImportError as e: print("warning: no mpi4py") exe.run(startup_program) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_batch_size(32) dataset.set_thread(3) dataset.set_filelist([ @@ -884,9 +896,6 @@ class TestDataset2(unittest.TestCase): """ Testcase for InMemoryDataset from create to run. 
""" - - self.skipTest("parameter server will add pslib UT later") - with open("test_in_memory_dataset2_run2_a.txt", "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" @@ -902,7 +911,7 @@ class TestDataset2(unittest.TestCase): train_program = fluid.Program() startup_program = fluid.Program() scope = fluid.Scope() - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet with fluid.program_guard(train_program, startup_program): slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"] slots_vars = [] @@ -936,7 +945,8 @@ class TestDataset2(unittest.TestCase): except ImportError as e: print("warning: no mpi4py") exe.run(startup_program) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_batch_size(32) dataset.set_thread(3) dataset.set_filelist([ @@ -952,6 +962,63 @@ class TestDataset2(unittest.TestCase): print("warning: catch expected error") fleet._opt_info = None fleet._fleet_ptr = None + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") + dataset.set_rank_offset("") + dataset.set_pv_batch_size(1) + dataset.set_hdfs_config("", "") + d = paddle.fleet.DatasetBase() + try: + dataset.set_feed_type("MultiSlotInMemoryDataFeed") + except: + print("warning: catch expected error") + dataset.thread_num = 0 + try: + dataset._prepare_to_run() + except: + print("warning: catch expected error") + dataset.set_parse_logkey(True) + dataset.set_merge_by_sid(True) + dataset.set_enable_pv_merge(True) + try: + dataset.preprocess_instance() + except: + print("warning: catch expected error") + try: + dataset.set_current_phase(1) + except: + print("warning: catch expected error") + try: + dataset.postprocess_instance() + except: + print("warning: catch expected error") + dataset.set_fleet_send_batch_size(1024) + try: + dataset.global_shuffle() + except: + print("warning: catch expected error") + dataset.get_pv_data_size() + dataset.get_memory_data_size() + dataset.get_shuffle_data_size() + dataset = paddle.fleet.DatasetFactory().create_dataset( + "QueueDataset") + try: + dataset.local_shuffle() + except: + print("warning: catch expected error") + try: + dataset.global_shuffle() + except: + print("warning: catch expected error") + dataset = paddle.fleet.FileInstantDataset() + try: + dataset.local_shuffle() + except: + print("warning: catch expected error") + try: + dataset.global_shuffle() + except: + print("warning: catch expected error") os.remove("./test_in_memory_dataset2_run2_a.txt") os.remove("./test_in_memory_dataset2_run2_b.txt") diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py index 10aefbb222bb029c48648ce27ead4666b15dfc4d..22d59e78fff867279880ddc283e096f4848512d0 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle import paddle.fluid as fluid import numpy as np import six @@ -96,7 +97,8 @@ class DatasetLoaderTestBase(unittest.TestCase): def check_batch_number(self, place, randomize_batch_num=False): main_prog, startup_prog, feeds = self.build_network() - dataset = fluid.DatasetFactory().create_dataset(self.dataset_name) + dataset = paddle.fleet.DatasetFactory().create_dataset( + self.dataset_name) dataset.set_batch_size(BATCH_SIZE) if isinstance(place, fluid.CPUPlace): diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py new file mode 100644 index 0000000000000000000000000000000000000000..6d67afe6cbfbb036ef54738a72d86ed798625112 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -0,0 +1,204 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle +from op_test import OpTest + + +class TestFlattenOp(OpTest): + def setUp(self): + self.op_type = "flatten_contiguous_range" + self.start_axis = 0 + self.stop_axis = -1 + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype("float64")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def test_check_output(self): + self.check_output(no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = -1 + self.new_shape = (120) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 1 + self.stop_axis = 2 + self.new_shape = (3, 10, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_2(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_3(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 2 + self.new_shape = (30, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_4(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = -2 + self.stop_axis = -1 + self.new_shape = (3, 2, 20) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class 
TestFlattenOp_5(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 2 + self.stop_axis = 2 + self.new_shape = (3, 2, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.start_axis = 3 + self.stop_axis = 5 + self.new_shape = (3, 2, 3, 32) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlatten2OpError(unittest.TestCase): + def test_errors(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x = x.astype('float32') + + def test_ValueError1(): + x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32') + out = paddle.flatten(x_var, start_axis=2, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError1) + + def test_ValueError2(): + x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=10, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError2) + + def test_ValueError3(): + x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=2, stop_axis=10) + + self.assertRaises(ValueError, test_ValueError3) + + def test_type(): + # dtype must be float32, float64, int8, int32, int64. + x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x2 = x2.astype('float16') + x2_var = paddle.data(name='x2', shape=[3, 2, 4, 5], dtype='float16') + paddle.flatten(x2_var) + + self.assertRaises(TypeError, test_type) + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + +class TestFlattenPython(unittest.TestCase): + def test_python_api(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. 
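+        # arange(...) / 100. yields float64; the cast below moves the data to
+        # float32, matching the dtype used by the imperative flatten checks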
+ x = x.astype('float32') + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + def test_Negative(): + paddle.enable_imperative() + img = paddle.imperative.to_variable(x) + out = paddle.flatten(img, start_axis=-2, stop_axis=-1) + return out.numpy().shape + + res_shape = test_Negative() + self.assertTrue((2, 3, 16) == res_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py index 88a9d235855ce813ad0abc0f304eb0e8adc35ab9..351dc0a5d0f66d91e063c0ef3ce84cd3756c0860 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py @@ -14,6 +14,7 @@ """Test cases for role makers.""" from __future__ import print_function +import paddle import os import unittest @@ -162,7 +163,8 @@ class TestCloudRoleMaker2(unittest.TestCase): data = "1 1 1 1\n" f.write(data) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fleet.DatasetFactory().create_dataset( + "InMemoryDataset") dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"]) dataset.set_use_var([show, label]) dataset.load_into_memory() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py index fb7454542587323a8775b066646bb1cd1c79c9ec..d8a5816a42a2fd03ecfaa11f22b602f89a422cda 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py @@ -30,6 +30,7 @@ def fusion_gru( wh, # D x 3D bias, # 1 x 3D is_reverse, + origin_mode, act_state, act_gate): return gru(fc(x, wx, bias), @@ -40,7 +41,8 @@ def fusion_gru( (1, wh.shape[1]), dtype='float32'), is_reverse, act_state, - act_gate) + act_gate, + origin_mode=origin_mode) class TestFusionGRUOp(OpTest): @@ -57,6 +59,8 @@ class TestFusionGRUOp(OpTest): self.with_bias = True self.act_state = 'tanh' self.act_gate = 'sigmoid' + self.origin_mode = False + self.use_mkldnn = False self.set_confs() T = sum(self.lod[0]) @@ -73,7 +77,7 @@ class TestFusionGRUOp(OpTest): (N, self.D), dtype='float32') _, _, _, hidden = fusion_gru( - x, self.lod, h0, wx, wh, bias, self.is_reverse, + x, self.lod, h0, wx, wh, bias, self.is_reverse, self.origin_mode, ACTIVATION[self.act_state], ACTIVATION[self.act_gate]) self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh} @@ -89,7 +93,9 @@ class TestFusionGRUOp(OpTest): self.attrs = { 'activation': self.act_state, 'gate_activation': self.act_gate, - 'is_reverse': self.is_reverse + 'is_reverse': self.is_reverse, + 'origin_mode': self.origin_mode, + 'use_mkldnn': self.use_mkldnn } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index a1ead2aef63f7b186ed2d5e8a6598349ae50509d..9da70e85f01c0a13a87766a1befbda206c510cbe 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -180,6 +180,51 @@ class TestLayer(LayerTest): self.assertRaises(TypeError, test_type) + def test_Flatten(self): + inp = np.ones([3, 4, 4, 5], dtype='float32') + with self.static_graph(): + t = layers.data( + name='data', + shape=[3, 4, 4, 5], + dtype='float32', + append_batch_size=False) + flatten = nn.Flatten() + ret = flatten(t) + static_ret = self.get_static_graph_result( + feed={'data': inp}, 
+                fetch_list=[ret])[0]
+        with self.dynamic_graph():
+            t = base.to_variable(inp)
+            flatten = nn.Flatten()
+            dy_ret = flatten(t)
+            dy_ret_value = dy_ret.numpy()
+
+        self.assertTrue(np.array_equal(static_ret, dy_ret_value))
+
+        with self.static_graph():
+
+            # the input of Linear must be a Variable.
+            def test_Variable():
+                inp = np.ones([3, 32, 32], dtype='float32')
+                linear = nn.Linear(
+                    32,
+                    4,
+                    bias_attr=fluid.initializer.ConstantInitializer(value=1))
+                linear_ret1 = linear(inp)
+
+            self.assertRaises(TypeError, test_Variable)
+
+            # the input dtype of Linear must be float16, float32 or float64;
+            # float16 can only be used on a GPU place.
+            def test_type():
+                inp = np.ones([3, 32, 32], dtype='int32')
+                linear = nn.Linear(
+                    32,
+                    4,
+                    bias_attr=fluid.initializer.ConstantInitializer(value=1))
+                linear_ret2 = linear(inp)
+
+            self.assertRaises(TypeError, test_type)
+
     def test_layer_norm(self):
         inp = np.ones([3, 32, 32], dtype='float32')
         with self.static_graph():
diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py
index 39601eb0e12ff8d4debdda414c55fb43a9b41d79..2d4c8f61c0406d2da8a3ae54f197dfcca26a3f12 100644
--- a/python/paddle/fluid/tests/unittests/test_monitor.py
+++ b/python/paddle/fluid/tests/unittests/test_monitor.py
@@ -16,6 +16,7 @@ TestCases for Monitor
 """
 from __future__ import print_function
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import numpy as np
@@ -51,7 +52,8 @@ class TestDatasetWithStat(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset = paddle.fleet.DatasetFactory().create_dataset(
+            "InMemoryDataset")
         dataset.set_batch_size(32)
         dataset.set_thread(3)
         dataset.set_filelist([
diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py
index fe31add697c65671eec12e8727499513129b1f05..dd1cf29eff9b7545121ac37908c4045dc924ceb0 100644
--- a/python/paddle/fluid/tests/unittests/test_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/test_pipeline.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import print_function
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
index 19af0c92154e0fa1f631ef885588d640a338fe1b..b8258f3153a801dfc78db5f43325c0dce5c4b611 100644
--- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
@@ -17,6 +17,7 @@ no_check_set_white_list = [
     'fake_quantize_range_abs_max',
     'coalesce_tensor',
     'flatten2',
+    'flatten_contiguous_range',
     'lrn',
     'squeeze2',
     'reshape2',
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 478e05c8975d06d602253d692114f77ca25de0af..9f0089f68ab1efa3dc4bd0e89e87b3b18e44b24f 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -15,7 +15,10 @@
 import sys
 import os
 
-__all__ = ['TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer']
+__all__ = [
+    'TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer',
+    'HeterXpuTrainer'
+]
 
 
 class TrainerDesc(object):
@@ -48,6 +51,43 @@ class TrainerDesc(object):
         self._program = None
         self._infer = False
 
+    def _set_heter_info(self, ret):
+        #ret = fu.split_program_by_device(program)
+        #start_list, end_list, send_list, recv_list, program_list = fu.split_program_by_device(program)
+        #if len(start_list) != 3:
+        #    print("start_list len=", len(start_list), " will not set heter info")
+        #    return
+        #for i in start_list[0]:
+        #    self.proto_desc.op_run_start_idx.append(i)
+        #for i in end_list[0]:
+        #    self.proto_desc.op_run_end_idx.append(i)
+        #for i in send_list[0]:
+        #    self.proto_desc.op_run_send_list.append(i)
+        #for i in recv_list[0]:
+        #    self.proto_desc.op_run_recv_list.append(i)
+        if ret is None:
+            return
+        #for i in ret[0]:  # start_list[1]:
+        #    self.proto_desc.xpu_start_idx.append(i)
+        self.proto_desc.xpu_start_idx = ret[0]
+
+        #for i in ret[1]:  # end_list[1]:
+        #    self.proto_desc.xpu_end_idx.append(i)
+        self.proto_desc.xpu_end_idx = ret[1]
+        for i in ret[2]:  # send_list[1]:
+            self.proto_desc.xpu_send_list.append(i)
+        for i in ret[3]:  # recv_list[1]:
+            self.proto_desc.xpu_recv_list.append(i)
+
+        #for i in start_list[2]:
+        #    self.proto_desc.op_run_end_start_idx.append(i)
+        #for i in end_list[2]:
+        #    self.proto_desc.op_run_end_idx.append(i)
+        #for i in send_list[2]:
+        #    self.proto_desc.op_run_end_send_list.append(i)
+        #for i in recv_list[2]:
+        #    self.proto_desc.op_run_end_recv_list.append(i)
+
     def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
         # convert fetch_info to list
         fetch_info = list(fetch_info)
@@ -122,6 +162,10 @@ class TrainerDesc(object):
             for param in dump_param:
                 self.proto_desc.dump_param.append(param)
 
+    def _set_worker_places(self, worker_places):
+        for place in worker_places:
+            self.proto_desc.worker_places.append(place)
+
     def _set_thread_barrier(self, thread_barrier):
         self.proto_desc.thread_barrier = thread_barrier
 
@@ -272,6 +316,30 @@ class DistMultiTrainer(TrainerDesc):
         self._device_worker._gen_worker_desc(self.proto_desc)
 
 
+class HeterXpuTrainer(TrainerDesc):
+    """
+    Implementation of HeterXpuTrainer.
+    It is used for distributed training.
+ """ + + def __init__(self): + super(HeterXpuTrainer, self).__init__() + pass + + def _set_program(self, program): + super(HeterXpuTrainer, self)._set_program(program) + self._program = program + + def _gen_trainer_desc(self): + super(HeterXpuTrainer, self)._gen_trainer_desc() + self.proto_desc.class_name = "HeterXpuTrainer" + if self._program == None: + raise RuntimeError("None Program") + self._device_worker._set_infer(self._infer) + self._device_worker._set_program(self._program) + self._device_worker._gen_worker_desc(self.proto_desc) + + class PipelineTrainer(TrainerDesc): """ Implement of PipelineTrainer. diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index c2d80f52b8db8dc9efbde079f93eca4bd5877cc2..f7573f6045dce2178d9c780df6717663f40a0871 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -22,7 +22,7 @@ from paddle.fluid.log_helper import get_logger local_logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer +from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT from .framework import Variable from multiprocessing import Process, Manager @@ -75,6 +75,8 @@ class TrainerFactory(object): if opt_info.get("dump_param") is not None and len( opt_info.get("dump_param")) != 0: trainer._set_dump_param(opt_info["dump_param"]) + if opt_info.get("worker_places") is not None: + trainer._set_worker_places(opt_info["worker_places"]) if opt_info.get("enable_random_dump") is not None: trainer._set_enable_random_dump(opt_info[ "enable_random_dump"]) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index e074ca66bb1d3700cc2e50db2b1439e991113f39..98948fa91e2e82ce0c566657114abfe79a5a7dc1 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -63,6 +63,7 @@ from .layer.common import Pool2D #DEFINE_ALIAS from .layer.common import Pad2D #DEFINE_ALIAS from .layer.common import Embedding #DEFINE_ALIAS from .layer.common import Linear #DEFINE_ALIAS +from .layer.common import Flatten #DEFINE_ALIAS from .layer.common import UpSample #DEFINE_ALIAS from .layer.conv import Conv2D #DEFINE_ALIAS from .layer.conv import Conv2DTranspose #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 4963ac360804f88dad9677e1dd9c05a5231c89b9..7173c5b587759f38a6c9b7172c02f326e09033a3 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -39,6 +39,7 @@ from .common import Pool2D #DEFINE_ALIAS from .common import Pad2D #DEFINE_ALIAS from .common import Embedding #DEFINE_ALIAS from .common import Linear #DEFINE_ALIAS +from .common import Flatten #DEFINE_ALIAS from .common import UpSample #DEFINE_ALIAS from .conv import Conv2D #DEFINE_ALIAS from .conv import Conv2DTranspose #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 8125e528b195b28024915ed9c20b922bd6224a5e..45259bea49d42eb07e0e593531a1680359f81a68 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -17,6 +17,7 @@ from ...fluid.dygraph import BilinearTensorProduct #DEFINE_ALIAS from ...fluid.dygraph import Pool2D #DEFINE_ALIAS from ...fluid.dygraph import Embedding #DEFINE_ALIAS from ...fluid.dygraph import Linear #DEFINE_ALIAS +from 
 from ...fluid.dygraph import layers
 from .. import functional as F
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index c2f67b4e13855b1a3e29e2bdd675dbf418b0a9a1..07d327a21ede6c40578ab91fd28a5f011f95aba3 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -25,7 +25,6 @@ import numpy as np
 from ..fluid.layers import cast  #DEFINE_ALIAS
 from ..fluid.layers import expand  #DEFINE_ALIAS
 from ..fluid.layers import expand_as  #DEFINE_ALIAS
-from ..fluid.layers import flatten  #DEFINE_ALIAS
 from ..fluid.layers import reshape  #DEFINE_ALIAS
 from ..fluid.layers import scatter  #DEFINE_ALIAS
 from ..fluid.layers import slice  #DEFINE_ALIAS
@@ -169,6 +168,114 @@ def flip(x, axis, name=None):
 reverse = flip  #DEFINE_ALIAS
 
 
+def flatten(x, start_axis=0, stop_axis=-1, name=None):
+    """
+    **Flatten op**
+
+    Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis.
+
+    For Example:
+
+    .. code-block:: text
+
+        Case 1:
+
+          Given
+            X.shape = (3, 100, 100, 4)
+
+          and
+            start_axis = 1
+            stop_axis = 2
+
+          We get:
+            Out.shape = (3, 100 * 100, 4)
+
+        Case 2:
+
+          Given
+            X.shape = (3, 100, 100, 4)
+
+          and
+            start_axis = 0
+            stop_axis = -1
+
+          We get:
+            Out.shape = (3 * 100 * 100 * 4, )
+
+    Args:
+        x (Variable): A tensor whose number of dimensions is >= start_axis, with data
+                      type float32, float64, int8, int32 or int64.
+        start_axis (int): The start axis to flatten.
+        stop_axis (int): The stop axis to flatten.
+        name(str, Optional): For details, please refer to :ref:`api_guide_Name`.
+                        Generally, no setting is required. Default: None.
+
+    Returns:
+        Variable: A tensor with the contents of the input tensor, with the input \
+                  axes from start_axis to stop_axis flattened into one dimension. \
+                  A Tensor with the same data type as input x.
+
+    Raises:
+        ValueError: If x is not a Variable.
+        ValueError: If start_axis or stop_axis is illegal.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.enable_imperative()
+
+            image_shape = (2, 3, 4, 4)
+            x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100.
+            x = x.astype('float32')
+
+            img = paddle.imperative.to_variable(x)
+            out = paddle.flatten(img, start_axis=1, stop_axis=2)
+            # out shape is [2, 12, 4]
+    """
+    if not isinstance(x, Variable):
+        raise ValueError("The input x should be a Variable")
+
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64'], 'flatten')
+    helper = LayerHelper('flatten', **locals())
+
+    x_dim = len(x.shape)
+    if not isinstance(start_axis, int) or (
+            start_axis > x_dim - 1) or start_axis < -x_dim:
+        raise ValueError(
+            "The start_axis should be an int, and in range [-rank(x), rank(x))")
+    if not isinstance(stop_axis, int) or (
+            stop_axis > x_dim - 1) or stop_axis < -x_dim:
+        raise ValueError(
+            "The stop_axis should be an int, and in range [-rank(x), rank(x))")
+    if start_axis < 0:
+        start_axis = start_axis + x_dim
+    if stop_axis < 0:
+        stop_axis = stop_axis + x_dim
+    if start_axis > stop_axis:
+        raise ValueError("The stop_axis should be no smaller than the start_axis")
+
+    if in_dygraph_mode():
+        dy_out, _ = core.ops.flatten_contiguous_range(
+            x, 'start_axis', start_axis, 'stop_axis', stop_axis)
+        return dy_out
+
+    out = helper.create_variable_for_type_inference(x.dtype)
+    x_shape = helper.create_variable_for_type_inference(x.dtype)
+    helper.append_op(
+        type='flatten_contiguous_range',
+        inputs={"X": x},
+        outputs={'Out': out,
+                 'XShape': x_shape},
+        attrs={"start_axis": start_axis,
+               "stop_axis": stop_axis})
+    return out
+
+
 def roll(x, shifts, axis=None, name=None):
     """
     :alias_main: paddle.roll
@@ -252,13 +359,18 @@ def roll(x, shifts, axis=None, name=None):
     return out
 
 
-def stack(x, axis=0, out=None, name=None):
+def stack(x, axis=0, name=None):
     """
     :alias_main: paddle.stack
-    :alias: paddle.stack,paddle.tensor.stack,paddle.tensor.manipulation.stack
+    :alias: paddle.stack, paddle.tensor.stack, paddle.tensor.manipulation.stack
-
-    This OP stacks all the inputs :code:`x` along axis.
+    This OP stacks all the input tensors ``x`` along the ``axis`` dimension.
+    All tensors must be of the same shape and same dtype.
+
+    For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of the stacked
+    tensor is [N, A, B]; if ``axis == 1``, the shape of the stacked
+    tensor is [A, N, B], etc.
+
 
     .. code-block:: text
@@ -284,7 +396,6 @@ def stack(x, axis=0, out=None, name=None):
         Case 2:
-
           Input:
             x[0].shape = [1, 2]
             x[0].data = [ [1.0 , 2.0 ] ]
@@ -295,7 +406,7 @@ def stack(x, axis=0, out=None, name=None):
 
         Attrs:
-          axis = 1 or axis = -2
+          axis = 1 or axis = -2  # If axis = -2, axis = axis+ndim(x[0])+1 = -2+2+1 = 1.
 
         Output:
           Out.shape = [1, 3, 2]
@@ -304,65 +415,40 @@ def stack(x, axis=0, out=None, name=None):
            [5.0, 6.0] ] ]
 
     Args:
-        x (Variable|list(Variable)): Input :code:`x` can be a single Tensor, a :code:`list` of Tensors.
-                                     If :code:`x` is a :code:`list`, the shapes of all these Tensors
-                                     must be the same. Supposing input is N dims
-                                     Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims
-                                     Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`.
-                                     Support data types: float32, float64, int32, int64.
-        axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
-                              R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
-                              The default value of axis is 0.
-
+        x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors.
+                                 If ``x`` is a ``list``, the Tensors in ``x``
+                                 must be of the same shape and dtype. Support data types: float32, float64, int32, int64.
+        axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
+                              where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
+                              If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
+        name (str, optional): Please refer to :ref:`api_guide_Name`. Default: None.
+
     Returns:
-        Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`.
+        Tensor: The stacked tensor with same data type as input.
 
     Example:
         .. code-block:: python
 
-            import numpy as np
+            import paddle
-            import paddle.fluid as fluid
+            import numpy as np
 
             data1 = np.array([[1.0, 2.0]])
             data2 = np.array([[3.0, 4.0]])
             data3 = np.array([[5.0, 6.0]])
-            with fluid.dygraph.guard():
-                x1 = fluid.dygraph.to_variable(data1)
-                x2 = fluid.dygraph.to_variable(data2)
-                x3 = fluid.dygraph.to_variable(data3)
-                result = paddle.stack([x1, x2, x3], axis=0)
-                # result shape: [3, 1, 2]
-                # result value: [[[1.0, 2.0]],
-                #                [[3.0, 4.0]],
-                #                [[5.0, 6.0]]]
-    """
-
-    helper = LayerHelper('stack', **locals())
-    axis = 0 if axis is None else axis
-
-    if not isinstance(x, list) and not isinstance(x, tuple):
-        x = [x]
-    out = helper.create_variable_for_type_inference(x[0].dtype)
-    if not in_dygraph_mode() and \
-            x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
-        assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
-            "number of the elements must be 1, but received %s." % len(x)
-        out_index = helper.create_variable_for_type_inference(dtype="int32")
-        helper.append_op(
-            type='tensor_array_to_tensor',
-            inputs={'X': x[0]},
-            outputs={'Out': [out],
-                     'OutIndex': [out_index]},
-            attrs={'axis': axis,
-                   'use_stack': True})
-    else:
-        helper.append_op(
-            type='stack',
-            inputs={'X': x},
-            outputs={'Y': out},
-            attrs={'axis': axis})
-    return out
+            paddle.enable_imperative()
+            x1 = paddle.imperative.to_variable(data1)
+            x2 = paddle.imperative.to_variable(data2)
+            x3 = paddle.imperative.to_variable(data3)
+
+            out = paddle.stack([x1, x2, x3], axis=0)
+            print(out.shape)  # [3, 1, 2]
+            print(out.numpy())
+            # [[[1., 2.]],
+            #  [[3., 4.]],
+            #  [[5., 6.]]]
+    """
+    return layers.stack(x, axis, name)
 
 
 def split(x, num_or_sections, axis=0, name=None):
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index 33537929044dbaa6c86ddce3dd972c02603eb0aa..85d0e133fa406df414977c0d69e0537ab5833891 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -16,8 +16,9 @@ from .plot import Ploter
 from .profiler import ProfilerOptions
 from .profiler import Profiler
 from .profiler import get_profiler
+from .deprecated import deprecated
 
-__all__ = ['dump_config', 'Ploter']
+__all__ = ['dump_config', 'Ploter', 'deprecated']
 
 #TODO: define new api under this directory
 # __all__ = ['unique_name',
diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py
new file mode 100644
index 0000000000000000000000000000000000000000..27621c2d872a6d10ec3259312abe318fef5b334b
--- /dev/null
+++ b/python/paddle/utils/deprecated.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+decorator to deprecate a function or class
+"""
+
+import warnings
+import functools
+import paddle
+
+
+def deprecated(update_to="", since="", reason=""):
+    """Decorate a function to signify its deprecation.
+
+    This function wraps a method that will soon be removed and does two things:
+        - The docstring of the API will be modified to include a notice
+          about deprecation.
+        - A :class:`DeprecationWarning` is issued when the old API is called.
+
+    Args:
+        since(str): The version at which the decorated method is considered deprecated.
+        update_to(str): The new API users should use.
+        reason(str): The reason why the API is deprecated.
+
+    Returns:
+        decorator: decorated function or class.
+    """
+
+    def decorator(func):
+        """construct the warning message, and return a decorated function or class."""
+        assert isinstance(update_to, str), 'type of "update_to" must be str.'
+        assert isinstance(since, str), 'type of "since" must be str.'
+        assert isinstance(reason, str), 'type of "reason" must be str.'
+
+        _since = since.strip()
+        _update_to = update_to.strip()
+        _reason = reason.strip()
+
+        msg = 'API "{}.{}" is deprecated'.format(func.__module__, func.__name__)
+        if len(_since) > 0:
+            msg += " since {}".format(_since)
+        msg += ", and may be removed in future versions."
+        if len(_update_to) > 0:
+            assert _update_to.startswith(
+                "paddle."
+            ), 'Argument update_to must start with "paddle.", your value is "{}"'.format(
+                update_to)
+            msg += ' Use "{}" instead.'.format(_update_to)
+        if len(_reason) > 0:
+            msg += "\n reason: {}".format(_reason)
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            """The deprecation warning should be fired in 3 circumstances:
+            1. the current version is a develop version, i.e. "0.0.0", because we assume the develop version is always the latest;
+            2. since is empty, in which case the API is deprecated in all versions;
+            3. the current version is newer than or equal to the since version.
+            """
+            # Parse "since" only when it is needed; this also avoids a
+            # ValueError from int("") when the default empty string is used.
+            warn = paddle.__version__ == "0.0.0" or _since == ""
+            if not warn:
+                v_current = [int(i) for i in paddle.__version__.split(".")]
+                v_current += [0] * (4 - len(v_current))
+                v_since = [int(i) for i in _since.split(".")]
+                v_since += [0] * (4 - len(v_since))
+                warn = v_current >= v_since
+            if warn:
+                warnings.simplefilter('always',
+                                      DeprecationWarning)  # turn off filter
+                warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
+                warnings.simplefilter('default',
+                                      DeprecationWarning)  # reset filter
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
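For reference, a minimal usage sketch of the `deprecated` decorator added by this patch. The function `my_old_api`, the version string "1.8.0", and the reason text are hypothetical, chosen only to illustrate when the warning fires; they are not part of the patch.

    import warnings

    from paddle.utils import deprecated


    # Hypothetical API used only for illustration; note that update_to
    # must start with "paddle." or the decorator's assertion fails.
    @deprecated(since="1.8.0", update_to="paddle.flatten", reason="demonstration only")
    def my_old_api(x):
        return x


    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        my_old_api(1)
        # The wrapper warns when paddle.__version__ is "0.0.0" (develop)
        # or is >= the "since" version given above.
        fired = any(issubclass(w.category, DeprecationWarning) for w in caught)
        print("DeprecationWarning fired:", fired)

Calling `my_old_api` under a qualifying Paddle version emits a message of the form `API "<module>.my_old_api" is deprecated since 1.8.0, and may be removed in future versions. Use "paddle.flatten" instead.`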