diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto deleted file mode 100644 index d0595ef1089a8792048dcd26c1dd12d8f5817905..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = false; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} -} - -// VariableMessage is serialized paddle variable message. -// It can be: -// LoDTensor -// SelectedRows -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; -} - -message VoidMessage {} diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index c5e69e41be93f0f00e9472669714be91d7d7fb0f..fcf86cc5839113b75855ce97459b2ee4881238cd 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -71,7 +71,6 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay build_strategy = fluid.BuildStrategy() - build_strategy.debug_graphviz_path = "/tmp/graphviz" build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index d740eb5443f2521361b40eac8bc5e315f8f371de..76389d916fc39f470a22aed4792bf7b754600436 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -152,6 +152,16 @@ class TestMNIST(TestParallelExecutorBase): use_cuda=use_cuda, use_reduce=use_reduce) + def test_simple_fc(self): + # use_cuda + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(False) + + def test_simple_fc_with_new_strategy(self): + # use_cuda, use_reduce + self._compare_reduce_and_allreduce(simple_fc_net, True) + self._compare_reduce_and_allreduce(simple_fc_net, False) + def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return @@ -178,6 +188,10 @@ class TestMNIST(TestParallelExecutorBase): for p_l in parallel_last_loss: self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) + def test_simple_fc_parallel_accuracy(self): + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(False) + def check_batchnorm_fc_convergence(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return @@ -192,31 +206,13 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda) - def check_batchnorm_fc_convergence_use_reduce(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): - return - self.check_network_convergence( - fc_with_batchnorm, use_cuda=use_cuda, use_reduce=False) - """ - img, label = self._init_data() - - all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=False) - reduce_first_loss, reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=True) - """ + def test_batchnorm_fc(self): + self.check_batchnorm_fc_convergence(True) + self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence_use_reduce(True) - # self.check_batchnorm_fc_convergence_use_reduce(False) + self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + self._compare_reduce_and_allreduce(fc_with_batchnorm, False) if __name__ == '__main__':