diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index b7d6edd389d8e40835dadf56d7c54d53402f6f4d..2e20c436dfdb61fcda78cd044b86848c750cf22c 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -30,8 +30,6 @@ namespace paddle { namespace framework { namespace details { -static constexpr char kAllOpDescs[] = "all_op_descs"; - VarHandle* GetValidInput(const OpHandleBase* a) { for (auto p : a->Inputs()) { VarHandle* b = dynamic_cast(p); diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 0c823b9ca2a135101f7776bf0a8f94f4a93ffb03..8c6c9f35e84f4fd7d2b5486ac0eb60beceb512a2 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -34,9 +34,11 @@ namespace details { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { // Should fix the allreduce op order if scheduling // them in multiple threads or processes to avoid hang. + // NOTE: ParallelGraph would execute this pass on each graph, so + // don't need to append it here. return (!strategy.enable_sequential_execution_ && - strategy.num_trainers_ > 1) || - strategy.enable_parallel_graph_; + strategy.num_trainers_ > 1) && + !strategy.enable_parallel_graph_; } class ParallelExecutorPassBuilder : public ir::PassBuilder { diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index dba96309fdf37e6ca73780cb194843659e1bd2d1..377367faf3c529496b00004f23159750cc2e4bc4 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -29,8 +29,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kAllOpDescs[] = "all_op_descs"; - std::vector SortOpLikeDescOrder(const ir::Graph& graph); // NOTE(dzh): A ordered set for node reuse in memory optimize. diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 75f922d2cca6855a67be7284ae407e549a1a1afb..7d1e63f3682bca8965f6c5e695132dff44fa3715 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( ir::Graph *result, const std::string &og) const { + OpHandleBase *op_handle = nullptr; + + auto append_allreduce_op = [&]( + const std::vector &scopes, + const std::vector &places) -> OpHandleBase * { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( - result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - local_scopes_, places_, nccl_ctxs_)); + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places, nccl_ctxs_)); #else - result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( - result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - local_scopes_, places_)); + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places)); #endif - auto *op_handle = result->Get(kGraphOps).back(); + return result->Get(kGraphOps).back(); + }; + + if (!strategy_.enable_parallel_graph_) + op_handle = append_allreduce_op(local_scopes_, places_); for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); + if (strategy_.enable_parallel_graph_) { + op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]}); + } + + SetCommunicationContext(op_handle, places_[i]); auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); @@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), - vars.size(), i, og, p); + vars.size(), i, og, places_[i]); vars.emplace_back(var); op_handle->AddOutput(var); } diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 1a2b75fbc0c28984ce5cf00e0a2ce0f804349bb1..9afbb91005c9c3a9d2e185f4dfa901ebf812ee19 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -36,13 +36,14 @@ namespace details { // map from variable name to variables. The variables, who have the same name, // will have a differsent version. The offset in the // `std::vector` is the version of varaibles. -typedef std::vector>> +typedef std::vector>> GraphVars; const char kGraphVars[] = "vars"; // aux variables to represent dependency. Useful to resolve data hazard. -typedef std::unordered_set GraphDepVars; +typedef std::unordered_set GraphDepVars; const char kGraphDepVars[] = "dep_vars"; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index b1a82e8771b92f2d0af4a1c7732ff2da54d496a8..e0aa352e95bc3685a1f4879bffa6e86eecd7e7f9 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -70,6 +70,9 @@ class OpHandleBase { auto it = dev_ctxes_.find(place); return it != dev_ctxes_.end() ? it->second : nullptr; } + const std::map &DeviceContext() { + return dev_ctxes_; + } void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { dev_ctxes_[place] = ctx_; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index e8deb5bfc6c013addce11e2a04286a828acc1930..4c8f69c68ce17d0143c34e8adbab92cdc90058c8 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -13,22 +13,92 @@ // limitations under the License. #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace details { +std::vector> +ParallelSSAGraphExecutor::SeparateMultiDevicesGraph( + std::unique_ptr &&graph) { + std::vector> graphs; + graphs.reserve(places_.size()); + for (size_t i = 0; i < places_.size(); ++i) { + ProgramDesc empty; + graphs.emplace_back(std::unique_ptr(new ir::Graph(empty))); + auto &g = graphs.back(); + g->Set(kGraphVars, new GraphVars(1UL)); + g->Set(kGraphDepVars, new GraphDepVars); + } + auto op_handles = ir::FilterByNodeWrapper(*graph); + + for (auto &op : op_handles) { + auto &dev_ctx = op->DeviceContext(); + auto &p = dev_ctx.begin()->first; + int dev_id = boost::get(p).device; + auto &dev_dummys = graphs[dev_id]->Get(kGraphDepVars); + graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release()); + + for (auto &var : op->Inputs()) { + auto dummy_ptr = dynamic_cast(var); + if (dummy_ptr) { + dev_dummys.insert(var); + if (graph->Nodes().count(var->Node())) + graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); + } + } + for (auto &var : op->Outputs()) { + auto dummy_ptr = dynamic_cast(var); + if (dummy_ptr) { + dev_dummys.insert(var); + if (graph->Nodes().count(var->Node())) + graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); + } + } + } + + for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) { + auto &dev_vars = graphs[dev_id]->Get(kGraphVars)[0]; + auto &origin_vars = graph->Get(kGraphVars)[dev_id]; + for (auto &name_pair : origin_vars) { + dev_vars.emplace(name_pair.first, name_pair.second); + for (auto &version_pair : name_pair.second) { + if (graph->Nodes().count(version_pair->Node())) { + graphs[dev_id]->AddNode( + graph->RemoveNode(version_pair->Node()).release()); + } + } + } + } + + return graphs; +} + ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> &&graphs) + const framework::ProgramDesc &main_prog, std::unique_ptr &&graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), places_(std::move(places)), - graphs_(std::move(graphs)) { + main_prog_(main_prog), + // TODO(Yancey1989): Copying graphs is not safely since it deleted the + // attrs. + graphs_(SeparateMultiDevicesGraph(std::move(graph))) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + auto seq_allreduce_pass = + ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); + seq_allreduce_pass->Erase(details::kAllOpDescs); + seq_allreduce_pass->Set>( + details::kAllOpDescs, + new std::vector(main_prog_.Block(0).AllOps())); + for (size_t i = 0; i < graphs_.size(); ++i) { + graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i])); + } + // set the correct size of thread pool to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() ? 1UL @@ -37,7 +107,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i)))); } } diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index c00c5bc2d1b4b78593f99c819b5a3d642150e773..1c35d45fdd356a867d1ad80b345379395e03172e 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -18,7 +18,9 @@ #include #include "ThreadPool.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { @@ -29,17 +31,23 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::vector> &&graphs); + const framework::ProgramDesc &main_prog, + std::unique_ptr &&graph); ~ParallelSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } FeedFetchList Run(const std::vector &fetch_tensors) override; private: + std::vector> SeparateMultiDevicesGraph( + std::unique_ptr &&graph); + ExecutionStrategy strategy_; std::vector local_scopes_; std::unique_ptr<::ThreadPool> pool_{nullptr}; std::vector places_; + framework::ProgramDesc main_prog_; std::vector> graphs_; std::vector> executors_; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 50bab832c2c1cc04dcc8be766aeb8a68ffd290d6..72acc337b7cc4803fa010373f8817ff5fd25cb2c 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -219,7 +219,7 @@ void ThreadedSSAGraphExecutor::RunOp( VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); - VLOG(10) << op << " " << op->Name() << "Signal posted"; + VLOG(10) << op << " " << op->Name() << " Signal posted"; } catch (...) { exception_holder_.Catch(std::current_exception()); } diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index feb3330176490e71c51cce826fe49d5499469ad7..296f3b83961c1379ee2c1237aa15784791b46878 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -26,6 +26,14 @@ limitations under the License. */ namespace paddle { namespace framework { + +namespace details { + +// This attr is not recommended, because the graph should not dependence +// the program once it is built. +constexpr char kAllOpDescs[] = "all_op_descs"; +} // namespace details + namespace ir { /* @@ -168,10 +176,13 @@ class Graph { return ret; } - void RemoveNode(ir::Node *node) { + std::unique_ptr RemoveNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); - node_set_.erase(node); + std::unique_ptr ret; + ret.reset(nodes_.at(node).release()); nodes_.erase(node); + node_set_.erase(node); + return ret; } // NOTE low performance, but simple and secure. @@ -184,13 +195,6 @@ class Graph { return nullptr; } - void ResolveHazard( - const std::map> &var_nodes); - - private: - std::map> InitFromProgram( - const ProgramDesc &program); - // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); @@ -199,6 +203,13 @@ class Graph { return node; } + void ResolveHazard( + const std::map> &var_nodes); + + private: + std::map> InitFromProgram( + const ProgramDesc &program); + // NOTE: program_ shouldn't be exposed to user. const ProgramDesc program_; std::map attrs_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ff7ef0cce2f12fe89dd087c4a5006b2cfdc5a4a9..56da5660095affa0ba49d8bc533d1da01ffd18be 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/details/all_reduce_deps_pass.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" @@ -193,7 +194,6 @@ ParallelExecutor::ParallelExecutor( member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," @@ -221,9 +221,10 @@ ParallelExecutor::ParallelExecutor( // choice the execution strategy. build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); - - VLOG(1) << "Enable ParallelGraph Execution: " - << build_strategy.enable_parallel_graph_; + if (build_strategy.enable_parallel_graph_) + VLOG(0) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; if (member_->use_cuda_) { // Bcast Parameters to all GPUs @@ -257,60 +258,44 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - std::vector> graphs; + std::unique_ptr graph; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (build_strategy.enable_parallel_graph_) { - for (size_t i = 0; i < member_->places_.size(); ++i) { - std::unique_ptr graph = build_strategy.Apply( - main_program, {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, - member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); - } - } else { - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); - graphs.push_back(std::move(graph)); - } + graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_, member_->nccl_ctxs_.get()); #else - std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); - graphs.push_back(std::move(graph)); + graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_); #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - for (size_t i = 0; i < graphs.size(); ++i) { - graphs[i] = member_->PrepareGCAndRefCnts( - std::move(graphs[i]), static_cast(max_memory_size)); - } + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &graph : graphs) { - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - } + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); } } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graphs[0]); + size_t graph_num = ir::GraphNum(*graph); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graphs[0]) + << ir::GraphNum(*graph) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -319,18 +304,25 @@ ParallelExecutor::ParallelExecutor( } if (build_strategy.enable_parallel_graph_) { +#ifdef PADDLE_WITH_CUDA + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr. member_->executor_.reset(new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs))); + exec_strategy, member_->local_scopes_, member_->places_, main_program, + std::move(graph))); +#else + PADDLE_THROW( + "Paddle should be compiled with CUDA for ParallelGraph Execution."); +#endif } else { if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + std::move(graph))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graphs[0]))); + std::move(graph))); } } @@ -482,11 +474,10 @@ bool ParallelExecutor::EnableParallelGraphExecution( } if (!member_->use_all_reduce_ || !member_->use_cuda_) - enable_parallel_graph = false; - if (build_strategy.enable_sequential_execution_ || - exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) - enable_parallel_graph = false; + if (build_strategy.enable_sequential_execution_ || + exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) + enable_parallel_graph = false; return enable_parallel_graph; } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py new file mode 100644 index 0000000000000000000000000000000000000000..041c56fce11e6f6abb0a941a9e9c9ad1cb60ab42 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py @@ -0,0 +1,107 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import numpy as np +import os +os.environ['FLAGS_enable_parallel_graph'] = str(1) +import paddle.fluid.core as core +import os +import paddle.fluid as fluid +from parallel_executor_test_base import TestParallelExecutorBase + + +def simple_fc_net(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def _init_data(self): + np.random.seed(5) + img = np.random.random(size=[32, 784]).astype(np.float32) + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + # simple_fc + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + if use_cuda and not core.is_compiled_with_cuda(): + return + + img, label = self._init_data() + + self.check_network_convergence( + simple_fc_net, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=use_reduce) + + def test_simple_fc(self): + # use_cuda + self.check_simple_fc_convergence(True) + + def check_simple_fc_parallel_accuracy(self, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + + img, label = self._init_data() + + single_first_loss, single_last_loss = self.check_network_convergence( + method=simple_fc_net, + seed=1, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_parallel_executor=False) + parallel_first_loss, parallel_last_loss = self.check_network_convergence( + method=simple_fc_net, + seed=1, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_parallel_executor=True) + + self.assertAlmostEquals( + np.mean(parallel_first_loss), + single_first_loss, + delta=1e-6, ) + self.assertAlmostEquals( + np.mean(parallel_last_loss), single_last_loss, delta=1e-6) + + def test_simple_fc_parallel_accuracy(self): + self.check_simple_fc_parallel_accuracy(True) + + +if __name__ == '__main__': + unittest.main()