diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 2d38663df9b06910145f461a8e5a9ad2cc7d1a84..256aded8ca234a24229e11f27b9e3e25728ad293 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -194,8 +194,6 @@ if(WITH_DISTRIBUTE) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(async_listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(async_listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 059099ee0ab840ba7ea9d7686f27c5da1ffbb14b..719a7465b8d58ef8588ff1e83c2b971eb6fbb00f 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_DISTRIBUTE) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - grpc_server.cc async_grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) + grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 349427525df65eb355c2403e83f7b769f527512a..73b7c85663d082251c0c01d3efb355c0aadbc5c0 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -143,7 +143,8 @@ class DistributeTranspiler: program=None, pservers="127.0.0.1:6174", trainers=1, - split_method=splitter.round_robin): + split_method=splitter.round_robin, + sync_mode=True): """ Transpile the program to distributed data-parallelism programs. The main_program will be transformed to use a remote parameter server @@ -191,6 +192,7 @@ class DistributeTranspiler: self.origin_program = program self.trainer_num = trainers self.optimize_ops = optimize_ops + self.sync_mode = sync_mode # TODO(typhoonzero): currently trainer_id is fetched from cluster system # like Kubernetes, we should port this to use etcd later when developing # fluid distributed training with fault-tolerance. @@ -473,7 +475,9 @@ class DistributeTranspiler: "OptimizeBlock": optimize_block, "endpoint": endpoint, "Fanin": self.trainer_num, - "PrefetchBlock": prefetch_block + "PrefetchBlock": prefetch_block, + "sync_mode": self.sync_mode, + "grad_to_id": [] }) pserver_program.sync_with_cpp()