diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 4d21fce5b2525e23441df0b0d21b96bb0797d1bb..575027893496e96b6b070d0cd2bd1d17abf20c7e 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -51,7 +51,8 @@ endif()
 cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL)
 cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
-cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS enforce simple_threadpool)
+cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool)
+cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder)
 cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
 cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory)
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f3b6b959e30194c10b1a58d6fc3e7a61ad01313
--- /dev/null
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+std::once_flag AsyncSparseParamUpdateRecorder::init_flag_;
+std::unique_ptr<AsyncSparseParamUpdateRecorder>
+    AsyncSparseParamUpdateRecorder::recorder_(nullptr);
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
index 4b071f6706779221864962458a1f6ab5648bc845..6250cd501286e5f668c008eb13342f215974b735 100644
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
@@ -67,10 +67,9 @@ class AsyncSparseParamUpdateRecorder {
       int trainer_num,
       const std::unordered_map<std::string, std::string>& grad_to_param)
       : trainer_num_(trainer_num), grad_to_param_(grad_to_param) {
-    for (auto iter = grad_to_param.begin(); iter != grad_to_param.end();
-         iter++) {
-      param_to_grad_[iter->second] = iter->first;
-      auto& param_name = iter->second;
+    for (auto& iter : grad_to_param) {
+      param_to_grad_[iter.second] = iter.first;
+      auto& param_name = iter.second;
       param_to_updated_rows_[param_name] = TrainerToRows();
       auto& trainer_to_rows = param_to_updated_rows_[param_name];
       for (auto i = 0; i < trainer_num; ++i) {
@@ -104,11 +103,41 @@ class AsyncSparseParamUpdateRecorder {
     return param_to_grad_.find(param_name) != param_to_grad_.end();
   }
 
+  bool HasGrad(const std::string& grad_name) {
+    return grad_to_param_.find(grad_name) != grad_to_param_.end();
+  }
+
  private:
   const int trainer_num_;
   std::unordered_map<std::string, std::string> grad_to_param_;
   std::unordered_map<std::string, std::string> param_to_grad_;
   std::unordered_map<std::string, TrainerToRows> param_to_updated_rows_;
+
+  // init recorder
+ public:
+  static void Init(
+      int trainer_num,
+      const std::unordered_map<std::string, std::string>& grad_to_param) {
+    InitImpl(trainer_num, grad_to_param);
+  }
+
+  static AsyncSparseParamUpdateRecorder* GetInstance() {
+    return recorder_.get();
+  }
+
+ private:
+  // InitImpl is called by Init to create the singleton.
+  static void InitImpl(
+      int trainer_num,
+      const std::unordered_map<std::string, std::string>& grad_to_param) {
+    if (recorder_ == nullptr) {
+      recorder_.reset(
+          new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param));
+    }
+  }
+
+  static std::once_flag init_flag_;
+  static std::unique_ptr<AsyncSparseParamUpdateRecorder> recorder_;
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
index af29230bad2eba7a3210de6db404cc96633162f4..67e8fd8a0edc4510d0abe885c821e75b528254f8 100644
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
@@ -73,6 +73,10 @@ TEST(AsyncSparseParamUpdateRecorder, All) {
   EXPECT_TRUE(recorder.HasParam("param2"));
   EXPECT_FALSE(recorder.HasParam("param3"));
 
+  EXPECT_TRUE(recorder.HasGrad("grad1"));
+  EXPECT_TRUE(recorder.HasGrad("grad2"));
+  EXPECT_FALSE(recorder.HasGrad("grad3"));
+
   std::vector<int64_t> ret;
   EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret));
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 991158ac72007efc1233f852caed4f90f35fe1cd..12ff08fc8af115f6d4e99281334d4a542c37a8d1 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -180,6 +180,10 @@ class RequestHandler {
     grad_to_prepared_ctx_ = g;
   }
 
+  void SetSparseGradToParam(std::unordered_map<std::string, std::string>* g) {
+    sparse_grad_to_param_ = g;
+  }
+
   void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; }
 
   // Get attributes.
@@ -228,6 +232,7 @@ class RequestHandler {
   std::unordered_map<std::string,
                      std::shared_ptr<framework::ExecutorPrepareContext>>*
       grad_to_prepared_ctx_;
+  std::unordered_map<std::string, std::string>* sparse_grad_to_param_;
   RPCServer* rpc_server_;
 };
diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
index a1ef1af39ff2ab1456706ebafbd3d7ce1acc0c07..1096f3773c6d44560d370502b1c550d67d40ca64 100644
--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
@@ -2,9 +2,9 @@ include(operators)
 set(DISTRIBUTE_DEPS "")
 if(WITH_GRPC)
-  set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
+  set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
 else()
-  set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
+  set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
   if(WITH_BRPC_RDMA)
     find_library(IBVERBS_LIBRARY NAMES ibverbs)
     ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 5b30ed472d51a37a0705d1717395da9e4ff7d743..a672fb2a9141a81383d947dcc961a112aee3f7ac 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -24,8 +24,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
+
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
@@ -292,6 +294,8 @@ static void FillRequestCtx(
     std::unordered_map<std::string,
                        std::shared_ptr<framework::ExecutorPrepareContext>>
         *prefetch_ctx,
+    std::unordered_map<std::string, std::string>
+        *sparse_grad_name_to_param_name,
     std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx,
     distributed::RPCServer *rpc_server) {
   h->SetScope(scope);
@@ -299,6 +303,7 @@ static void FillRequestCtx(
   h->SetDevCtx(dev_ctx);
   h->SetExecutor(executor);
   h->SetProgram(program);
   h->SetPrefetchPreparedCtx(prefetch_ctx);
+  h->SetSparseGradToParam(sparse_grad_name_to_param_name);
   h->SetRPCServer(rpc_server);
   h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
 }
@@ -414,10 +419,24 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
   }
 
-  auto f =
-      std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx,
-                &executor, program, &prefetch_var_name_to_prepared_ctx,
-                ckpt_pre_context, rpc_service_.get());
+  // parse attr of kSparseGradToParam: sparse_grad_name -> param_name
+  std::unordered_map<std::string, std::string> sparse_grad_name_to_param_name;
+  auto sparse_grad_name_to_param_name_str =
+      Attr<std::vector<std::string>>(kSparseGradToParam);
+  for (const auto &sparse_grad_name_and_param_name :
+       sparse_grad_name_to_param_name_str) {
+    std::vector<std::string> pieces;
+    split(sparse_grad_name_and_param_name, ':', &pieces);
+    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
+            << ", param_name = " << pieces[1];
+    sparse_grad_name_to_param_name[pieces[0]] = pieces[1];
+  }
+
+  auto f = std::bind(
+      FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, &executor,
+      program, &prefetch_var_name_to_prepared_ctx,
+      &sparse_grad_name_to_param_name, ckpt_pre_context, rpc_service_.get());
 
   f(request_send_handler_.get());
   f(request_get_handler_.get());
@@ -445,6 +464,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
                 prefetch_block_id_list, checkpoint_block_id);
   } else {
+    distributed::AsyncSparseParamUpdateRecorder::Init(
+        fan_in, sparse_grad_name_to_param_name);
     RunAsyncLoop(&executor, program, &recv_scope);
   }
 }
@@ -475,6 +496,10 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
                                       "prefetch blocks to run on server side.")
         .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        kSparseGradToParam,
+        "sparse grad name to param name. like: 'emb@Grad:emb'")
+        .SetDefault({});
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
     AddAttr<int>(kCheckpointBlockId,
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
index f20442bad7c5bd96173b9d6efc4dceb13feacf5b..bff2763e987a43bc4dab0b391c8ffae87758bca1 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
@@ -35,6 +35,7 @@ namespace operators {
 constexpr char kOptimizeBlocks[] = "optimize_blocks";
 constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
 constexpr char kCheckpointBlockId[] = "checkpint_block_id";
+constexpr char kSparseGradToParam[] = "sparse_grad_to_param";
 
 void RunServer(std::shared_ptr<distributed::RPCServer> service);
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 4ddfc084e06b1dc1b018b479304d1e9c1871de19..0c3290cdccc36206a63fc796e6f343685da1984a 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -791,11 +791,15 @@ class DistributeTranspiler(object):
 
         global_ops = []
 
+        # sparse grad name to param name
+        sparse_grad_to_param = []
+
         def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
                                    lr_ops):
             if self._is_optimizer_op(op):
                 self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
-                                         self.origin_program, merged_var)
+                                         self.origin_program, merged_var,
+                                         sparse_grad_to_param)
             elif op not in lr_ops:
                 self._append_pserver_non_opt_ops(block, op)
@@ -911,6 +915,7 @@ class DistributeTranspiler(object):
             "Fanin": self.trainer_num,
             "sync_mode": self.sync_mode,
             "grad_to_block_id": grad_to_block_id,
+            "sparse_grad_to_param": sparse_grad_to_param,
         }
 
         if self.has_distributed_lookup_table:
@@ -1778,7 +1783,8 @@ class DistributeTranspiler(object):
         return o4
 
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
-                            grad_to_block_id, origin_program, merged_var):
+                            grad_to_block_id, origin_program, merged_var,
+                            sparse_grad_to_param):
         program = optimize_block.program
         pserver_block = program.global_block()
         new_inputs = collections.OrderedDict()
@@ -1862,6 +1868,12 @@ class DistributeTranspiler(object):
             outputs=outputs,
             attrs=opt_op.all_attrs())
 
+        # record sparse grad to param name
+        if new_inputs["Grad"].type == core.VarDesc.VarType.SELECTED_ROWS:
+            sparse_grad_to_param.append(
+                str(new_inputs["Grad"].name) + ":" + str(new_inputs["Param"]
+                                                         .name))
+
     def _get_pserver_grad_param_var(self, var, var_dict):
         """
        Return pserver side grad/param variable, return None
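
For context, a minimal sketch (not part of the patch) of how the pieces above fit together on the parameter server in async mode: the transpiler emits the "sparse_grad_to_param" attribute, ListenAndServOp parses it into a grad-name -> param-name map, initializes the AsyncSparseParamUpdateRecorder singleton with it, and handlers can then consult the recorder via GetInstance(). Only Init, GetInstance, HasParam and HasGrad appear in the diff; the GetAndClear(param, trainer_id, &rows) call is inferred from the unit test, and the names/values below are illustrative only.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"

using paddle::operators::distributed::AsyncSparseParamUpdateRecorder;

void RecorderUsageSketch() {
  // What listen_and_serv builds from the "sparse_grad_to_param" attribute,
  // e.g. the entry "emb@Grad:emb" split on ':'.
  std::unordered_map<std::string, std::string> grad_to_param{
      {"emb@Grad", "emb"}};
  int trainer_num = 2;  // the op's "Fanin" attribute

  // Done once in ListenAndServOp::RunImpl before RunAsyncLoop.
  AsyncSparseParamUpdateRecorder::Init(trainer_num, grad_to_param);

  auto* recorder = AsyncSparseParamUpdateRecorder::GetInstance();
  if (recorder->HasGrad("emb@Grad") && recorder->HasParam("emb")) {
    // Signature inferred from the unit test: fetch the rows recorded for
    // "emb" and trainer 0; the test expects a throw when trainer_id is
    // out of range (>= trainer_num).
    std::vector<int64_t> rows;
    recorder->GetAndClear("emb", /*trainer_id=*/0, &rows);
  }
}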