diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index fb3d8ef8d53436f387acc3069a0eb887e6f07c59..8b7d91f234594becdda805c089fac0bb4e4e8e44 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") set(ANAKIN_COMPILE_EXTRA_FLAGS + -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable -Wno-error=unused-variable -Wno-unused-variable -Wno-error=format-extra-args -Wno-format-extra-args -Wno-error=comment -Wno-comment @@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-reorder -Wno-error=cpp) -set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz") +set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz") # A helper function used in Anakin, currently, to use it, one need to recursively include # nearly all the header files. @@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}") message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") + execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}") execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz") + execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz") endif() if (WITH_ANAKIN) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 5f3bfa296546fcbc6a3410d7ae072ff74954bc74..3ef317bb7a1c25c5738342f34ae7994b0184a7de 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -263,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -306,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', ' paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) +paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 139411f3e0d945f9265d19a28487c05d06722d69..6440607dbe4666ff3ff91dc526465706b3b9c1f0 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,6 +7,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) +cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) else() diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 60382faffb8e53870658b2d1ff83abc4008cb4cf..1a9ce746ea840bc088d222cc4e9bc05159d64734 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -17,6 +17,8 @@ #include #include +using float16 = paddle::platform::float16; + namespace paddle { namespace framework { @@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() { RegisterType(retv, proto_type, #cc_type) // NOTE: Add your customize type here. - RegType(platform::float16, proto::VarType::FP16); + RegType(float16, proto::VarType::FP16); RegType(float, proto::VarType::FP32); RegType(double, proto::VarType::FP64); RegType(int, proto::VarType::INT32); diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..54c41c55ba63c0b2001cfcb6a9e94fbb0036d437 --- /dev/null +++ b/paddle/fluid/framework/data_type_test.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/data_type.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" + +TEST(DataType, float16) { + using paddle::framework::Tensor; + using paddle::platform::CPUPlace; + using paddle::platform::float16; + namespace f = paddle::framework; + f::proto::VarType::Type dtype = f::proto::VarType::FP16; + + Tensor tensor; + CPUPlace cpu; + tensor.mutable_data(cpu, f::ToTypeIndex(dtype)); + + // test fp16 tensor + EXPECT_EQ(tensor.type(), std::type_index(typeid(float16))); + + // test fp16 size + EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u); + + // test debug info + std::string type = "float16"; + EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); +} diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h new file mode 100644 index 0000000000000000000000000000000000000000..6e302a29233b96451df14b4685911be1cd87c1ab --- /dev/null +++ b/paddle/fluid/framework/details/exception_holder.h @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +class ExceptionHolder { + public: + void Catch(const platform::EnforceNotMet& exp) { + std::lock_guard lock(mu_); + exception_.reset(new platform::EnforceNotMet(exp)); + type_ = kEnforceNotMet; + } + + void Catch(const platform::EOFException& exp) { + std::lock_guard lock(mu_); + // EOFException will not cover up existing EnforceNotMet. + if (exception_.get() == nullptr) { + exception_.reset(new platform::EOFException(exp)); + type_ = kEOF; + } + } + + bool ExceptionCatched() const { + std::lock_guard lock(mu_); + return exception_.get() != nullptr; + } + + void Throw() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + break; + case kEnforceNotMet: { + auto e = *static_cast(exception_.get()); + throw e; + break; + } + case kEOF: { + auto e = *static_cast(exception_.get()); + throw e; + break; + } + default: + LOG(FATAL) << "Unknown exception."; + } + exception_.reset(); + type_ = kNone; + } + + void Clear() { + std::lock_guard lock(mu_); + exception_.reset(); + type_ = kNone; + } + + private: + enum ExceptionType { kNone, kEnforceNotMet, kEOF }; + ExceptionType type_{kNone}; + + std::unique_ptr exception_; + mutable std::mutex mu_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 1b188aec5995edb73835bcf5b851952db0f95f48..5e87e0bf50b51d2b630aba06a5907dd721754d1f 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -41,7 +41,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); - const ir::Graph& Graph() const { return underlying_executor_->Graph(); } + const ir::Graph& Graph() const override { + return underlying_executor_->Graph(); + } FeedFetchList Run(const std::vector& fetch_tensors) override; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index eec405073377b2782d7636c08e6eb3a7bd41202d..e556c84b0219eba2b92c456c205e03947171626b 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Clean run context run_op_futures_.clear(); - exception_.reset(); + exception_holder_.Clear(); // Step 3. Execution while (!pending_vars.empty()) { @@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { - std::unique_lock l(exception_mu_); - if (exception_) { - l.unlock(); + if (exception_holder_.ExceptionCatched()) { for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - l.lock(); - std::exception *exp = exception_.get(); - if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else { - LOG(FATAL) << "Unknown exception."; - } + exception_holder_.Throw(); } else { continue; } @@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp( ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EOFException ex) { - std::lock_guard l(exception_mu_); - // EOFException will not cover up existing EnforceNotMet. - if (exception_.get() == nullptr) { - exception_.reset(new platform::EOFException(ex)); - } + exception_holder_.Catch(ex); } catch (platform::EnforceNotMet ex) { - std::lock_guard l(exception_mu_); - exception_.reset(new platform::EnforceNotMet(ex)); + exception_holder_.Catch(ex); } catch (...) { LOG(FATAL) << "Unknown exception catched"; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 82d6b5272aba161bb19067ebef054bc4bbb8701c..9135c1f5d435d5e2c60eb90c80803361aa31a3c4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,6 +24,7 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" @@ -42,7 +43,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { const std::vector &places, std::unique_ptr &&graph); - const ir::Graph &Graph() const { return *graph_; } + const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool // Use topological sort algorithm FeedFetchList Run(const std::vector &fetch_tensors) override; @@ -59,8 +60,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; - std::mutex exception_mu_; - std::unique_ptr exception_; + ExceptionHolder exception_holder_; std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index b517442bb73f43bc1cb1d639b6c6cf004b28d4cf..a260dd3da2a7863c06e51aa4feafd824ea254139 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -116,8 +116,8 @@ TEST(GraphHelperTest, Basic) { for (size_t i = 0; i < sorted.size(); ++i) { node_map[sorted[i]->Name()] = i; } - ASSERT_EQ(node_map.at("op1"), 0); - ASSERT_EQ(node_map.at("op2"), 1); + ASSERT_EQ(node_map.at("op1"), 0UL); + ASSERT_EQ(node_map.at("op2"), 1UL); ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); } } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 73ef55756c330bdbc3be89c436967b2a88625a43..f9e6bdf3625bdced9d1a9195a979b0f46016d8bf 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -97,15 +97,15 @@ TEST(GraphTest, Basic) { std::vector nodes(g->Nodes().begin(), g->Nodes().end()); for (ir::Node *n : nodes) { if (n->Name() == "sum") { - ASSERT_EQ(n->inputs.size(), 3); - ASSERT_EQ(n->outputs.size(), 1); + ASSERT_EQ(n->inputs.size(), 3UL); + ASSERT_EQ(n->outputs.size(), 1UL); } else if (n->Name() == "test_a" || n->Name() == "test_b" || n->Name() == "test_c") { - ASSERT_EQ(n->inputs.size(), 0); - ASSERT_EQ(n->outputs.size(), 1); + ASSERT_EQ(n->inputs.size(), 0UL); + ASSERT_EQ(n->outputs.size(), 1UL); } else if (n->Name() == "test_out") { - ASSERT_EQ(n->inputs.size(), 1); - ASSERT_EQ(n->outputs.size(), 0); + ASSERT_EQ(n->inputs.size(), 1UL); + ASSERT_EQ(n->outputs.size(), 0UL); } } ASSERT_EQ(nodes.size(), 5); diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index db95861c510b52a5b52229541434e6437d3fb9f4..3e17a512ce154de88ac890f3b29f03385595d95c 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) { ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type[" "CUDNN]"); + + using CUDAPlace = paddle::platform::CUDAPlace; + OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), + "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + "type[CUDNN]"); } TEST(OpKernelType, Hash) { diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922..2288c7fe6609a765612b468d69ad35101b92b384 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( return OpProtoAndCheckerMaker::VariableBuilder{output}; } +void OpProtoAndCheckerMaker::Reuse(const std::string& name, + const std::string& reused_name) { + bool found = false; + proto::OpProto::Var* var; + + for (auto& var : proto_->inputs()) { + if (var.name() == reused_name) { + found = true; + break; + } + } + PADDLE_ENFORCE(found == true, + "Input/Output name: %s reused_name: %s, one of them is not " + "exists or not matched.", + name, reused_name); + + found = false; + for (int i = 0; i < proto_->outputs().size(); ++i) { + var = proto_->mutable_outputs()->Mutable(i); + if (var->name() == name) { + PADDLE_ENFORCE(!var->has_reuse(), + "Output(%s) has been set reused var of %s", name, + var->reuse()); + found = true; + var->set_reuse(reused_name); + break; + } + } + PADDLE_ENFORCE(found == true, + "Input/Output name: %s reused_name: %s, one of them is not " + "exists or not matched.", + name, reused_name); +} + void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { std::unordered_set names; auto checker = [&](const std::string& name) { diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 92f86bb5de520878d0a7b8d7214620580242c061..80970291c9c234f1306162f4ffa3c2528f88c35f 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker { VariableBuilder AddOutput(const std::string &name, const std::string &comment); + void Reuse(const std::string &name, const std::string &reused_name); + template TypedAttrChecker &AddAttr(const std::string &name, const std::string &comment, diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index 58f70cb39c0d96ed3b9ff35ea132ba75a37f5405..b71c7b646857e11f291748c4c7c2af92b6d53231 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) { } class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddOutput("XOut", "output of test op").Reuse("X"); + } +}; + +class TestInplaceProtoMaker2 + : public paddle::framework::OpProtoAndCheckerMaker { public: void Make() { AddInput("X", "input of test op"); @@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, InplaceOutput) { - paddle::framework::proto::OpProto op_proto; + paddle::framework::proto::OpProto op_proto, op_proto2; paddle::framework::OpAttrChecker op_checker; TestInplaceProtoMaker proto_maker; - ASSERT_THROW(proto_maker(&op_proto, &op_checker), + TestInplaceProtoMaker2 proto_maker2; + + proto_maker(&op_proto, &op_checker); + + ASSERT_THROW(proto_maker2(&op_proto2, &op_checker), paddle::platform::EnforceNotMet); - // proto_maker(&op_proto, &op_checker); - // proto_maker.Make(); - // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); } + +// normal reuse +class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddInput("Y", "input of test op"); + AddOutput("Out", "output of test op"); + AddOutput("XOut", "output of test op"); + // avoid destructor exception. + // Validate(); + TestReuse(); + } + + virtual void TestReuse() {} +}; + +// test duplicate reuse error +class TestReuseProtoMaker2 : public TestReuseProtoMaker { + public: + void TestReuse() { + Reuse("Out", "X"); + Reuse("Out", "Y"); + } +}; + +// NotExists Input +class TestReuseProtoMaker3 : public TestReuseProtoMaker { + public: + void TestReuse() { + Reuse("Out", "NotExists"); + Reuse("XOut", "X"); + } +}; + +// NotExists Output +class TestReuseProtoMaker4 : public TestReuseProtoMaker { + public: + void TestReuse() { Reuse("NotExists", "X"); } +}; + +TEST(ProtoMaker, Reuse) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestReuseProtoMaker proto_maker; + proto_maker(&op_proto, &op_checker); +} + +// NOTE(dzhwinter): +// There is a Fatal CHECK on base class destructor, which will call abort inside +// instead of +// throw an exception. If we throw an exception in Make(), we will trigger the +// CHECK and terminate the tests. +// +// I had tried to replace the default CHECK with a exception, however, it's +// still not supported by glog. +// the details: +// https://github.com/google/glog/issues/249 +// https://github.com/facebookresearch/TensorComprehensions/issues/351 +/* +TEST(ProtoMaker, ReuseWithException) { + paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4; + paddle::framework::OpAttrChecker op_checker; + TestReuseProtoMaker2 proto_maker2; + TestReuseProtoMaker3 proto_maker3; + TestReuseProtoMaker4 proto_maker4; + EXPECT_THROW(proto_maker2(&op_proto2, &op_checker), + paddle::platform::EnforceNotMet); + + EXPECT_THROW(proto_maker3(&op_proto3, &op_checker), + paddle::platform::EnforceNotMet); + + EXPECT_THROW(proto_maker4(&op_proto4, &op_checker), + paddle::platform::EnforceNotMet); +} + +void FailureFunction() { + throw std::runtime_error("Check failed in destructor."); + // return 0; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::InstallFailureFunction(&FailureFunction); + return RUN_ALL_TESTS(); +} +*/ diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7c1c29fd9a81c558f7fd05abf52cd0a6dd522190..cdac00739bc48648b41751e644a953d0d310ffbf 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" @@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name, } if (var->IsType()) { - return var->Get().dims(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return DDim({-1}); + } + return tensor.dims(); } else if (var->IsType()) { if (get_actual_dim) { return var->Get().value().dims(); @@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name, } } +static std::string GetDtype(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return ""; + } + + if (var->IsType()) { + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return ""; + } + return DataTypeToString(ToDataType(tensor.type())); + } else if (var->IsType()) { + return DataTypeToString( + ToDataType(var->Get().value().type())); + } else { + return ""; + } +} + static int GetRowSize(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -91,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } if (var->IsType()) { - return var->Get().lod(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return default_lod; + } + return tensor.lod(); } else { return default_lod; } @@ -172,6 +201,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { if (row_size >= 0) { ss << "[row_size=" << row_size << "]"; } + std::string dtype = GetDtype(*scope, input.second[i]); + ss << ":" << dtype; ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index ef224d68f1fc561f45e9d7a81425e62655457648..0bbfd66148e9bc9080654bf1b0b34477115a0e6b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -82,7 +82,7 @@ class Tensor { template const T* data() const; - bool IsInitialized() const; + inline bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 0a1cb6d5703dace5e6be73285655ecd9d2ad89fb..cb2061c06a429d8e8116001a4aa4e8c46ea13428 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/tensor.h" #include #include +#include "paddle/fluid/platform/float16.h" namespace framework = paddle::framework; namespace platform = paddle::platform; @@ -213,3 +214,17 @@ TEST(Tensor, Layout) { src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } + +TEST(Tensor, FP16) { + using platform::float16; + framework::Tensor src; + float16* src_ptr = src.mutable_data({2, 3}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16)); + // EXPECT a human readable error message + // src.data(); + // Tensor holds the wrong type, it holds N6paddle8platform7float16E at + // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43] +} diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 67d355d10d3c9e11b59c9ce9d208826523095459..27fe575cb6167a726ff92a8f3d2e47b6f536ba39 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph tensorrt_subgraph_node_mark_pass.cc analyzer.cc helper.cc + model_store_pass.cc DEPS framework_proto proto_desc) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) +cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) @@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) +inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index b3a1075e5adf4a24bf32017574c061f36c46ba8c..98bdfcc00b9f0e8f40dfc92e4021b2bd6fb19313 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" @@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, DEFINE_string(inference_analysis_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +DEFINE_string(inference_analysis_output_storage_path, "", + "optimized model output path"); + namespace inference { namespace analysis { @@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager { AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); } AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + if (!FLAGS_inference_analysis_output_storage_path.empty()) { + AddPass("model-store-pass", new ModelStorePass); + } } std::string repr() const override { return "dfg-pass-manager"; } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 0132bf5b9c6552391aaa19542669487f42b685a7..c82fdfff86c91b4e07e3c1b80987d3d8d796ad23 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -16,28 +16,23 @@ limitations under the License. */ /* * This file contains Analyzer, an class that exposed as a library that analyze - * and optimize - * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to - * control whether - * an process is applied on the program. + * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has + * multiple flags to + * control whether an process is applied on the program. * * The processes are called Passes in analysis, the Passes are placed in a - * pipeline, the first - * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to - * a data flow - * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow - * graph to a - * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes - * which take a - * node or data flow graph as input. + * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a + * Fluid ProgramDesc to + * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms + * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the + * pipeline can be any Passes + * which take a node or data flow graph as input. * * The Analyzer can be used in two methods, the first is a executable file which - * can be used to - * pre-process the inference model and can be controlled by passing difference - * command flags; + * can be used to pre-process the inference model and can be controlled by + * passing difference command flags; * the other way is to compose inside the inference API as a runtime pre-process - * phase in the - * inference service. + * phase in the inference service. */ #include @@ -50,6 +45,7 @@ namespace paddle { // flag if not available. DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); DECLARE_string(inference_analysis_graphviz_log_root); +DECLARE_string(inference_analysis_output_storage_path); namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e1fe3eb797cdced56a61aa2db0c3d18601824f8 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_main.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file implements analysizer -- an executation help to analyze and + * optimize trained model. + */ +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + using paddle::inference::analysis::Analyzer; + using paddle::inference::analysis::Argument; + + Argument argument; + Analyzer analyzer; + analyzer.Run(&argument); + + return 0; +} diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 25a440e7e71fddb38cc515f99d15231675a8172e..24bfb3993cf569561980006b6627b56327dd0f67 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -20,14 +20,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, analysis_without_tensorrt) { +TEST(Analyzer, analysis_without_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } -TEST_F(DFG_Tester, analysis_with_tensorrt) { +TEST(Analyzer, analysis_with_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 6d316f20bff7a68754b0afec6463bd5d7579227f..a17d6281a2976f0600c7ce94c2d43e65d30de265 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,6 +23,7 @@ #pragma once +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" @@ -36,6 +37,16 @@ namespace analysis { * All the fields should be registered here for clearness. */ struct Argument { + Argument() = default; + explicit Argument(const std::string& fluid_model_dir) + : fluid_model_dir(new std::string(fluid_model_dir)) {} + // The directory of the trained model. + std::unique_ptr fluid_model_dir; + // The path of `__model__` and `param`, this is used when the file name of + // model and param is changed. + std::unique_ptr fluid_model_program_path; + std::unique_ptr fluid_model_param_path; + // The graph that process by the Passes or PassManagers. std::unique_ptr main_dfg; @@ -44,6 +55,9 @@ struct Argument { // The processed program desc. std::unique_ptr transformed_program_desc; + + // The output storage path of ModelStorePass. + std::unique_ptr model_output_store_path; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 1c60d5de21538043962cc58a6f508aea635fe8c4..16aeae4d35e7bd54646053190da7f47eaca69aa0 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -36,6 +36,8 @@ namespace analysis { /* * DataFlowGraph - A container of Value and Function Nodes. + * + * This is the base graph for any other type of graphs, such as SSA or CFG. */ struct DataFlowGraph { NodeMap nodes; @@ -174,7 +176,7 @@ struct GraphTraits { // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph); +ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 7912f8d7f17ae3c79e8f73f36b7095fd52c9ac86..a881262665f156812da9e1576aa29b05fc398499 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -20,7 +20,7 @@ namespace inference { namespace analysis { TEST(DataFlowGraph, BFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); @@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) { } TEST(DataFlowGraph, DFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); GraphTraits trait(&dfg); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index d8fc5e580a98f76233f01fdc4d7987311f78ee45..4ef381db295b986b91173a728b6d98640f6f4f51 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -26,21 +26,21 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Test) { - DataFlowGraph graph; +TEST(DataFlowGraph, Test) { + Argument argument(FLAGS_inference_model_dir); FluidToDataFlowGraphPass pass0; DataFlowGraphToFluidPass pass1; ASSERT_TRUE(pass0.Initialize(&argument)); ASSERT_TRUE(pass1.Initialize(&argument)); - pass0.Run(&graph); - pass1.Run(&graph); + pass0.Run(argument.main_dfg.get()); + pass1.Run(argument.main_dfg.get()); pass0.Finalize(); pass1.Finalize(); - LOG(INFO) << graph.nodes.size(); + LOG(INFO) << argument.main_dfg->nodes.size(); } }; // namespace analysis diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 65842b1e850953e77e3d4d28416609be271af9f1..928be7917047382d9b86294f6039b26b0ebf6f49 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -23,12 +23,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { - auto dfg = ProgramDescToDFG(*argument.origin_program_desc); +TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) { + Argument argument(FLAGS_inference_model_dir); + FluidToDataFlowGraphPass pass0; + ASSERT_TRUE(pass0.Initialize(&argument)); + pass0.Run(argument.main_dfg.get()); + + // auto dfg = ProgramDescToDFG(*argument.origin_program_desc); + DFG_GraphvizDrawPass::Config config("./", "test"); DFG_GraphvizDrawPass pass(config); pass.Initialize(&argument); - pass.Run(&dfg); + pass.Run(argument.main_dfg.get()); // test content std::ifstream file("./0-graph_test.dot"); diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 496921db9eabce1b1e40c7cb13089446ca93321c..511631d3e067f14bc1230d9e4b4d92dbe604e1d4 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -25,8 +26,20 @@ namespace analysis { bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); - PADDLE_ENFORCE(argument); + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set, might " + "duplicate called"; + } + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + if (!argument->main_dfg) { argument->main_dfg.reset(new DataFlowGraph); } @@ -40,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); PADDLE_ENFORCE(desc_); // insert vars + // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id + // will keep updating to its latest alias during the graph-building. std::unordered_map var2id; auto &main_block = desc_->blocks(framework::kRootBlockIndex); for (int i = 0; i < main_block.vars_size(); i++) { @@ -51,6 +66,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { var2id[var.name()] = v->id(); } + // The variables in a SSA can only write once, so if a variable is written + // multiple times(quite common in our ProgramDesc design), multiple alias + // Nodes of this variable will be created, and each will just write once. + + // An set that keep all the names of the variables(the original, not alias) + // that have been written(as outputs). Once an Op's output variable hit the + // set, it should create a new alias and update the global alias for this + // variable. And that make a Data Flow Graph a SSA. + std::unordered_set unique_written_vars; for (int i = 0; i < main_block.ops_size(); i++) { const auto &op = main_block.ops(i); auto *o = graph->nodes.Create(Node::Type::kFunction); @@ -62,33 +86,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { o->SetPbMsg(op.SerializeAsString()); // set inputs and outputs - std::unordered_set inlinks; for (int j = 0; j < op.inputs_size(); j++) { auto &in_var = op.inputs(j); for (int k = 0; k < in_var.arguments_size(); k++) { auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); in->outlinks.push_back(o); o->inlinks.push_back(in); - inlinks.insert(in); } } for (int j = 0; j < op.outputs_size(); j++) { auto &out_var = op.outputs(j); for (int k = 0; k < out_var.arguments_size(); k++) { auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); - if (inlinks.count(out)) { + if (unique_written_vars.count(out)) { // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). auto *out_alias = graph->nodes.Create(Node::Type::kValue); out_alias->SetName(out->name()); out_alias->SetPbDesc(out->pb_desc()); out_alias->SetPbMsg(out->pb_msg()); - var2id[out_alias->name()] = out_alias->id(); // update a -> a0 + var2id[out_alias->name()] = + out_alias->id(); // update variable's alias Node LOG(INFO) << "loop found in graph, create SSA alias node [" << out_alias->repr() << "] for [" << out->repr() << "]"; out = out_alias; } out->inlinks.push_back(o); o->outlinks.push_back(out); + unique_written_vars.insert(out); } } } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index da8463b63bd0bb1633bfcb9d7d41a884ddd632c7..fb948bf2242abcbc1e841fd3b8457e63358782c5 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -30,7 +30,7 @@ namespace inference { namespace analysis { /* - * Transform a FluidDesc to a data flow graph. + * Transform a FluidDesc to a SSA. */ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { public: diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index dadb84059d21adab44159a6145b345460663cb96..d218dcd05015aa4636c16569de4addf4936c8cd5 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -21,8 +21,9 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Init) { +TEST(FluidToDataFlowGraphPass, Test) { FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); pass.Initialize(&argument); pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index f1064cd20f28092d80d3fd23a862da080b6cc2f3..a0f912b251d5ea29594a7f601d5b2bce91201790 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include @@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) { } } +static framework::proto::ProgramDesc LoadProgramDesc( + const std::string &model_path) { + std::ifstream fin(model_path, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(buffer); + return program_desc; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c429176424bd5c1d8fa5e015c19d698f966880e --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void ModelStorePass::Run(DataFlowGraph *x) { + if (!argument_->fluid_model_param_path) { + PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir); + argument_->fluid_model_param_path.reset( + new std::string(*argument_->fluid_model_dir + "param")); + } + PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path); + // Directly copy param file to destination. + std::stringstream ss; + // NOTE these commands only works on linux. + ss << "mkdir -p " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + ss.str(""); + + ss << "cp " << *argument_->fluid_model_dir << "/*" + << " " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + + // Store program + PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, + "program desc is not transformed, should call " + "DataFlowGraphToFluidPass first."); + const std::string program_output_path = + *argument_->model_output_store_path + "/__model__"; + std::ofstream file(program_output_path, std::ios::binary); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.", + program_output_path); + const std::string serialized_message = + argument_->transformed_program_desc->SerializeAsString(); + file.write(serialized_message.c_str(), serialized_message.size()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fac7083925776b6209d49255c9e67b930cb1250b --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.h @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines ModelStorePass, which store the runtime DFG to a Paddle + * model in the disk, and that model can be reloaded for prediction. + */ + +#pragma once +#include +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class ModelStorePass : public DataFlowGraphPass { + public: + bool Initialize(Argument* argument) override { + if (!argument) { + LOG(ERROR) << "invalid argument"; + return false; + } + argument_ = argument; + return true; + } + + void Run(DataFlowGraph* x) override; + + std::string repr() const override { return "DFG-store-pass"; } + std::string description() const override { + return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle + model in the disk, and that model can be reloaded for prediction again.)DD"; + } + + private: + Argument* argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f3526dd504e77e58d79b4f675db86a22fd0f26b --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(inference_model_dir, "", "Model path"); + +TEST(DFG_StorePass, test) { + Analyzer analyzer; + Argument argument(FLAGS_inference_model_dir); + argument.model_output_store_path.reset( + new std::string("./_dfg_store_pass_tmp")); + // disable storage in alalyzer + FLAGS_inference_analysis_output_storage_path = ""; + analyzer.Run(&argument); + + ModelStorePass pass; + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142..6806f9ff7dada2c1e2328e1ffbfd225afefcf474 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -50,6 +50,7 @@ class Pass { // Create a debugger Pass that draw the DFG by graphviz toolkit. virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } + virtual void Run() { LOG(FATAL) << "not valid"; } // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } // Run on a single Function. diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index dac1c509d728114bd24a2ea1150c407646026fd4..13423e4837e12a96e7a5dfc9ca3f59bf8b14746a 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -56,7 +56,7 @@ class TestNodePass final : public NodePass { std::string description() const override { return "some doc"; } }; -TEST_F(DFG_Tester, DFG_pass_manager) { +TEST(PassManager, DFG_pass_manager) { TestDfgPassManager manager; DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); @@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) { manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + Argument argument(FLAGS_inference_model_dir); + ASSERT_TRUE(&argument); ASSERT_TRUE(manager.Initialize(&argument)); manager.RunAll(); } -TEST_F(DFG_Tester, Node_pass_manager) { +TEST(PassManager, Node_pass_manager) { + Argument argument(FLAGS_inference_model_dir); // Pre-process: initialize the DFG with the ProgramDesc first. FluidToDataFlowGraphPass pass0; pass0.Initialize(&argument); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 67dd4da54b95add703428e1fded61065f60353e8..39cc433b40fad17f4f12359d4e907a250a88bd63 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { return false; }; -TEST_F(DFG_Tester, Split) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Split) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); LOG(INFO) << "spliter\n" << dfg.DotString(); @@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) { ASSERT_EQ(subgraphs.back().size(), 6UL); } -TEST_F(DFG_Tester, Fuse) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Fuse) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); size_t count0 = dfg.nodes.size(); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc index a6c15e848b99ca318f4583e3d4b88345fe8e5ebc..c1d932878e559180af987594535959afdf475587 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -22,11 +22,11 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { +TEST(TensorRTSubgraphNodeMarkPass, test) { // init FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); ASSERT_TRUE(pass.Initialize(&argument)); - argument.main_dfg.reset(new DataFlowGraph); pass.Run(argument.main_dfg.get()); TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { @@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { for (auto& node : argument.main_dfg->nodes.nodes()) { counter += node->attr(ATTR_supported_by_tensorrt).Bool(); } - + ASSERT_EQ(counter, 2); LOG(INFO) << counter << " nodes marked"; } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 1d749d3fa3f39b351ccee6ebeb82467f7220a0b6..67a5af83d89b771536ea11be51b35244ff5c09d6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -25,7 +25,7 @@ namespace analysis { DEFINE_string(dot_dir, "./", ""); -TEST_F(DFG_Tester, tensorrt_single_pass) { +TEST(TensorRTSubGraphPass, main) { std::unordered_set teller_set( {"elementwise_add", "mul", "sigmoid"}); SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { @@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { return false; }; - LOG(INFO) << "init"; + Argument argument(FLAGS_inference_model_dir); + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; @@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { FluidToDataFlowGraphPass pass0; TensorRTSubGraphPass trt_pass(std::move(teller)); - LOG(INFO) << "Initialize"; dfg_pass.Initialize(&argument); dfg_pass1.Initialize(&argument); pass0.Initialize(&argument); trt_pass.Initialize(&argument); - LOG(INFO) << "Run"; argument.main_dfg.reset(new DataFlowGraph); pass0.Run(argument.main_dfg.get()); dfg_pass.Run(argument.main_dfg.get()); diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index ce1191a567a4198f003520c40bf02487c48c56eb..1073a6f686eaeeaaae2d93ab044149b7df518085 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -32,27 +32,12 @@ namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); -static framework::proto::ProgramDesc LoadProgramDesc( - const std::string& model_dir = FLAGS_inference_model_dir) { - std::string msg; - std::string net_file = FLAGS_inference_model_dir + "/__model__"; - std::ifstream fin(net_file, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", net_file); - fin.seekg(0, std::ios::end); - msg.resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(msg.at(0)), msg.size()); - fin.close(); - framework::proto::ProgramDesc program_desc; - program_desc.ParseFromString(msg); - return program_desc; -} - static DataFlowGraph ProgramDescToDFG( const framework::proto::ProgramDesc& desc) { DataFlowGraph graph; FluidToDataFlowGraphPass pass; Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); pass.Initialize(&argument); pass.Run(&graph); @@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG( class DFG_Tester : public ::testing::Test { protected: void SetUp() override { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 3e60a61793339990648737c3d549d46cc5f5a887..259d79bedbf664f52b1189ca71567665a6d91180 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -19,6 +19,7 @@ endif(APPLE) set(inference_deps paddle_inference_api paddle_fluid_api) + if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) endif() @@ -63,6 +64,8 @@ endif() if (WITH_ANAKIN) # only needed in CI # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to + # compile the libinference_anakin_api.a and compile with anakin.so. + fetch_include_recursively(${ANAKIN_INCLUDE}) # compile the libinference_anakin_api.a and anakin.so. nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) @@ -73,7 +76,7 @@ if (WITH_ANAKIN) # only needed in CI if (WITH_TESTING) cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api) + DEPS inference_anakin_api_shared) target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endif(WITH_TESTING) endif() diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index 0206ac60103759deda91be741617bde63e003de6..6b374ceefbc180a5c22abe591f12e1c3d89bc64a 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -18,26 +18,36 @@ namespace paddle { -PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( +template +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( const AnakinConfig &config) { CHECK(Init(config)); } -bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { +template +bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { if (!(graph_.load(config.model_file))) { + LOG(FATAL) << "fail to load graph from " << config.model_file; return false; } - graph_.ResetBatchSize("input_0", config.max_batch_size); + auto inputs = graph_.get_ins(); + for (auto &input_str : inputs) { + graph_.ResetBatchSize(input_str, config.max_batch_size); + } // optimization for graph if (!(graph_.Optimize())) { return false; } // construct executer - executor_.init(graph_); + if (executor_p_ == nullptr) { + executor_p_ = new anakin::Net(graph_, true); + } return true; } -bool PaddleInferenceAnakinPredictor::Run( +template +bool PaddleInferenceAnakinPredictor::Run( const std::vector &inputs, std::vector *output_data, int batch_size) { for (const auto &input : inputs) { @@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run( << "'s type is not float"; return false; } - auto d_tensor_in_p = executor_.get_in(input.name); + auto d_tensor_in_p = executor_p_->get_in(input.name); + auto net_shape = d_tensor_in_p->valid_shape(); + if (net_shape.size() != input.shape.size()) { + LOG(ERROR) << " input " << input.name + << "'s shape size should be equal to that of net"; + return false; + } + int sum = 1; + for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; }); + if (sum > net_shape.count()) { + graph_.Reshape(input.name, input.shape); + delete executor_p_; + executor_p_ = new anakin::Net(graph_, true); + d_tensor_in_p = executor_p_->get_in(input.name); + } + + anakin::saber::Shape tmp_shape; + for (auto s : input.shape) { + tmp_shape.push_back(s); + } + d_tensor_in_p->reshape(tmp_shape); + float *d_data_p = d_tensor_in_p->mutable_data(); if (cudaMemcpy(d_data_p, static_cast(input.data.data()), d_tensor_in_p->valid_size() * sizeof(float), @@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run( } cudaStreamSynchronize(NULL); } - - executor_.prediction(); + cudaDeviceSynchronize(); + executor_p_->prediction(); + cudaDeviceSynchronize(); if (output_data->empty()) { LOG(ERROR) << "At least one output should be set with tensors' names."; return false; } for (auto &output : *output_data) { - auto *tensor = executor_.get_out(output.name); - output.shape = tensor->shape(); + auto *tensor = executor_p_->get_out(output.name); + output.shape = tensor->valid_shape(); if (output.data.length() < tensor->valid_size() * sizeof(float)) { output.data.Resize(tensor->valid_size() * sizeof(float)); } @@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run( return true; } -anakin::Net - &PaddleInferenceAnakinPredictor::get_executer() { - return executor_; +template +anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return *executor_p_; } // the cloned new Predictor of anakin share the same net weights from original // Predictor -std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { +template +std::unique_ptr +PaddleInferenceAnakinPredictor::Clone() { VLOG(3) << "Anakin Predictor::clone"; - std::unique_ptr cls(new PaddleInferenceAnakinPredictor()); + std::unique_ptr cls( + new PaddleInferenceAnakinPredictor()); // construct executer from other graph auto anakin_predictor_p = - dynamic_cast(cls.get()); + dynamic_cast *>(cls.get()); if (!anakin_predictor_p) { LOG(ERROR) << "fail to call Init"; return nullptr; @@ -103,14 +140,28 @@ std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { return std::move(cls); } +template class PaddleInferenceAnakinPredictor; +template class PaddleInferenceAnakinPredictor; + // A factory to help create difference predictor. template <> std::unique_ptr CreatePaddlePredictor< AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { VLOG(3) << "Anakin Predictor create."; - std::unique_ptr x( - new PaddleInferenceAnakinPredictor(config)); - return x; -} + if (config.target_type == AnakinConfig::NVGPU) { + VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; + } else if (config.target_type == AnakinConfig::X86) { + VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; + } else { + VLOG(3) << "Anakin Predictor create on unknown platform."; + return nullptr; + } +}; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index def096c867ec85624f5b221782ef8b6240923c05..836badd9799228c6c294dcad5df73d039d36a1ff 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -20,14 +20,16 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/inference/api/paddle_inference_api.h" -// from anakin #include "framework/core/net/net.h" +#include "framework/graph/graph.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "saber/core/shape.h" #include "saber/saber_types.h" namespace paddle { +template class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: PaddleInferenceAnakinPredictor() {} @@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { std::unique_ptr Clone() override; - anakin::Net& + anakin::Net& get_executer(); - ~PaddleInferenceAnakinPredictor() override{}; + ~PaddleInferenceAnakinPredictor() override { + delete executor_p_; + executor_p_ = nullptr; + }; private: bool Init(const AnakinConfig& config); - anakin::graph::Graph + anakin::graph::Graph graph_; - anakin::Net - executor_; + anakin::Net* + executor_p_{nullptr}; AnakinConfig config_; }; diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc index d6d631bfbad4278fe99e4553a410a9d9162dcc7b..62e820b68c79a47d963bb174663bfc8c4ac22de3 100644 --- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc +++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc @@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include +#include "gflags/gflags.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" -DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(model, "", "Directory of the inference model(mobile_v2)."); namespace paddle { AnakinConfig GetConfig() { AnakinConfig config; + // using AnakinConfig::X86 if you need to use cpu to do inference + config.target_type = AnakinConfig::NVGPU; config.model_file = FLAGS_model; config.device = 0; config.max_batch_size = 1; @@ -36,28 +38,27 @@ TEST(inference, anakin) { CreatePaddlePredictor(config); float data[1 * 3 * 224 * 224] = {1.0f}; - - PaddleTensor tensor{.name = "input_0", - .shape = std::vector({1, 3, 224, 224}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({1, 3, 224, 224}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::FLOAT32; // For simplicity, we set all the slots with the same data. - std::vector paddle_tensor_feeds; - paddle_tensor_feeds.emplace_back(std::move(tensor)); + std::vector paddle_tensor_feeds(1, tensor); - PaddleTensor tensor_out{.name = "prob_out", - .shape = std::vector({1000, 1}), - .data = PaddleBuf(), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor_out; + tensor_out.name = "prob_out"; + tensor_out.shape = std::vector({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; - std::vector outputs; - outputs.emplace_back(std::move(tensor_out)); + std::vector outputs(1, tensor_out); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); float* data_o = static_cast(outputs[0].data.data()); - for (size_t j = 0; j < 1000; ++j) { + for (size_t j = 0; j < outputs[0].data.length(); ++j) { LOG(INFO) << "output[" << j << "]: " << data_o[j]; } } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 58fd7c6f8b05a846bd4a82068f09f5d9ef5a6516..08d7af6d3af7054061b15b904c69b2862c629562 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + feeds->push_back(input); } return true; @@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch( buffer.Resize(sizeof(float) * data.size()); } std::memcpy(buffer.data(), data.data(), buffer.length()); + // copy LoD + for (const auto &level : fetchs[i].lod()) { + outputs->at(i).lod.emplace_back(level); + } outputs->at(i).dtype = PaddleDType::FLOAT32; // TODO(panyx0718): support other types? fill tensor name? avoid a copy. } diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index c0891e9c281961fa03d278a0f5c676f92672c419..45b5a7638b7dc6a54bbd905766fd5c284cb6aea1 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + if (!config_.model_dir.empty()) { + argument.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument.fluid_model_param_path.reset( + new std::string(config_.param_file)); + } argument.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); Singleton::Global().Run(&argument); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 62d98a796708612e7d4ff8abfd85125978ce22c7..fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { std::vector data(20); for (int i = 0; i < 20; i++) data[i] = i; - PaddleTensor tensor{ - .name = "", - .shape = std::vector({10, 1}), - .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({10, 1}); + tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 5f96fecf93f7a6c42bc6b9fe4e0d985c626388d7..03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -47,10 +47,10 @@ void Main(bool use_gpu) { //# 2. Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); @@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) { // 2. Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + std::vector inputs(4, tensor); std::vector outputs; // 3. Run diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 0a2a2b713ab21a3124d8a85ba469f64278623ec4..3800d49b34738d5a272033d75cb415ae9ad1fb8f 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -20,8 +20,8 @@ limitations under the License. */ #include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. #include #include +#include "paddle/fluid/inference/demo_ci/utils.h" #include "paddle/fluid/platform/enforce.h" -#include "utils.h" #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); @@ -123,11 +123,11 @@ void Main(bool use_gpu) { file.close(); // Inference. - PaddleTensor input{ - .name = "xx", - .shape = record.shape, - .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; VLOG(3) << "run executor"; std::vector output; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2f8b4f8596946988a728b5cf82de251bfda778a9..59b0df7968cce137843ba8cad38a62fdb8d3bfc1 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -44,7 +44,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -67,9 +67,9 @@ struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. std::vector shape; - // TODO(Superjomn) for LoD support, add a vector> field if needed. PaddleBuf data; // blob of data. PaddleDType dtype; + std::vector> lod; // lod data }; enum class PaddleEngineKind { @@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config { // Configurations for Anakin engine. struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; int device; std::string model_file; int max_batch_size{-1}; + TargetType target_type; }; struct TensorRTConfig : public NativeConfig { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 3864f337bdadc61e7531304e2cf2ee52a25253f2..d86c046f2e5b08a4c00cf6cad19627e6a196c798 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,6 +1,6 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc + SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -13,6 +13,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) - +nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL) nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL) + +nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99..dba1d50b2d1c487ced8e6ca51f2d257641ad5fc7 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -20,11 +20,60 @@ namespace tensorrt { class Conv2dOpConverter : public OpConverter { public: - Conv2dOpConverter() {} void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("Input").front()); + // Declare weights + auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); + const int n_output = Y_t->dims()[0]; + const int filter_h = Y_t->dims()[2]; + const int filter_w = Y_t->dims()[3]; + + const int groups = boost::get(op_desc.GetAttr("groups")); + const std::vector dilations = + boost::get>(op_desc.GetAttr("dilations")); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + nvinfer1::DimsHW nv_ksize(filter_h, filter_w); + nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]); + nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + Y_t->memory_size() / sizeof(float)}; + + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, Convolution, *const_cast(X), n_output, + nv_ksize, weight.get(), bias.get()); + PADDLE_ENFORCE(layer != nullptr); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + layer->setDilation(nv_dilations); + layer->setNbGroups(groups); + + auto output_name = op_desc.Output("Output").front(); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3744550f60a1696aedd8a3ecd24f1b21d22325b9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -0,0 +1,210 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ElementwiseWeightOpConverter : public OpConverter { + public: + ElementwiseWeightOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + PADDLE_ENFORCE(dims_x.nbDims >= 3); + + auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + + std::vector dims_y = framework::vectorize2int(Y_t->dims()); + if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { + if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); + } + + if (static_cast(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + } else if (static_cast(dims_y.size()) == dims_x.nbDims && + dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != dims_x.d[i]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + break; + } + } + if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != 1) + PADDLE_THROW( + "TensorRT unsupported weight shape for Elementwise op!"); + } + } + } else { + PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); + } + + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + Y_t->memory_size() / sizeof(float)}; + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *const_cast(X), scale_mode, + shift_weights.get(), scale_weights.get(), power_weights.get()); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } +}; + +class ElementwiseTensorOpConverter : public OpConverter { + public: + ElementwiseTensorOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + nvinfer1::Dims dims_y = Y->getDimensions(); + + // The two input tensor should have the same dims + PADDLE_ENFORCE(dims_x.nbDims >= 3); + if (dims_x.nbDims == dims_y.nbDims) { + for (int i = 0; i < dims_x.nbDims; i++) { + if (dims_x.d[i] != dims_y.d[i]) + PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); + } + } else { + PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); + } + + auto op_pair = ops.find(op_type_); + if (op_pair == ops.end()) { + PADDLE_THROW("Wrong elementwise op type!"); + } + nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *const_cast(X), + *const_cast(Y), op_pair->second); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } + + protected: + static const std::unordered_map + ops; + std::string op_type_; +}; + +const std::unordered_map + ElementwiseTensorOpConverter::ops = { + {"add", nvinfer1::ElementWiseOperation::kSUM}, + {"mul", nvinfer1::ElementWiseOperation::kPROD}, + {"sub", nvinfer1::ElementWiseOperation::kSUB}, + {"div", nvinfer1::ElementWiseOperation::kDIV}, + {"min", nvinfer1::ElementWiseOperation::kMIN}, + {"pow", nvinfer1::ElementWiseOperation::kPOW}, + {"max", nvinfer1::ElementWiseOperation::kMAX}, +}; + +class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorAddOpConverter() { op_type_ = "add"; } +}; + +class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMulOpConverter() { op_type_ = "mul"; } +}; + +class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorSubOpConverter() { op_type_ = "sub"; } +}; + +class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorDivOpConverter() { op_type_ = "div"; } +}; + +class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMinOpConverter() { op_type_ = "min"; } +}; + +class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMaxOpConverter() { op_type_ = "max"; } +}; + +class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorPowOpConverter() { op_type_ = "pow"; } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter); + +REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, + ElementwiseTensorAddOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor, + ElementwiseTensorSubOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor, + ElementwiseTensorDivOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor, + ElementwiseTensorMulOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor, + ElementwiseTensorMaxOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor, + ElementwiseTensorMinOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor, + ElementwiseTensorPowOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 409efac6799b6fb8d27a1343a55e7a508760868f..39fe1f609d7b94638506877fc301f19ef33ec8ac 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, } // indata c * k // Reorder the data layout from CK to KC. -void ReorderCKtoKC(TensorRTEngine::Weight& iweights, +void ReorderCKtoKC(TensorRTEngine::Weight& iweights, // NOLINT TensorRTEngine::Weight* oweights) { int c = iweights.dims[0]; int k = iweights.dims[1]; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 968f7eb99ce8519edaa585fd3cb642bd80cc63cc..1b6a0ad82f3ceb00cec15c28c8121adc22271b7a 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -55,6 +55,31 @@ class OpConverter { it = Registry::Lookup("fc"); } } + + if (op_desc.Type().find("elementwise") != std::string::npos) { + static std::unordered_set add_tensor_op_set{ + "add", "mul", "sub", "div", "max", "min", "pow"}; + // TODO(xingzhaolong): all mul, sub, div + // static std::unordered_set add_weight_op_set {"add", "mul", + // "sub", "div"}; + static std::unordered_set add_weight_op_set{"add"}; + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + int op_type_len = op_desc.Type().size(); + std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = + Registry::Lookup("elementwise_" + op_type + "_weight"); + } else { + PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = + Registry::Lookup("elementwise_" + op_type + "_tensor"); + } + } + if (!it) { it = Registry::Lookup(op_desc.Type()); } diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8711c6b60d74639529624c25429bc245de46479 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(conv2d_op, test) { + std::unordered_set parameters({"conv2d-Y"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5)); + validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3)); + validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("conv2d"); + desc.SetInput("Input", {"conv2d-X"}); + desc.SetInput("Filter", {"conv2d-Y"}); + desc.SetOutput("Output", {"conv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("dilations", dilations); + desc.SetAttr("groups", groups); + + validator.SetOp(*desc.Proto()); + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(conv2d); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7537d02a35b66a41c158cd8eb1b1e5d4107e7d84 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(elementwise_op, add_weight_test) { + std::unordered_set parameters({"elementwise_add-Y"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +TEST(elementwise_op, add_tensor_test) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(8, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + // the defalut axis of elementwise op is -1 + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(elementwise_add); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 9b79f86b0edba983019bd932f52b08711ff36d41..d6651a5b244ba31a01220e6299cb2016ae61fe64 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) { framework::ProgramDesc prog; auto* block = prog.MutableBlock(0); auto* conv2d_op = block->AppendOp(); + + // init trt engine + cudaStream_t stream_; + std::unique_ptr engine_; + engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); + engine_->InitNetwork(); + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + + engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(2, 5, 5)); + conv2d_op->SetType("conv2d"); + conv2d_op->SetInput("Input", {"conv2d-X"}); + conv2d_op->SetInput("Filter", {"conv2d-Y"}); + conv2d_op->SetOutput("Output", {"conv2d-Out"}); - OpConverter converter; + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + conv2d_op->SetAttr("strides", strides); + conv2d_op->SetAttr("paddings", paddings); + conv2d_op->SetAttr("dilations", dilations); + conv2d_op->SetAttr("groups", groups); + + // init scope framework::Scope scope; - converter.ConvertBlock(*block->Proto(), {}, scope, - nullptr /*TensorRTEngine*/); + std::vector dim_vec = {3, 2, 3, 3}; + auto* x = scope.Var("conv2d-Y"); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + + OpConverter converter; + converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope, + engine_.get() /*TensorRTEngine*/); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 39529cc2c799212f91107b1b86dd2c8c3642b6da..63c2f978f253df11100ecca83acae5eab6a0337d 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -149,7 +149,7 @@ class TRTConvertValidation { cudaStreamSynchronize(*engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); - const size_t output_space_size = 2000; + const size_t output_space_size = 3000; for (const auto& output : op_desc_->OutputArgumentNames()) { std::vector fluid_out; std::vector trt_out(output_space_size); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index b3781ded01c09edd59df09fd064b37052ad0333a..22cbf680c0670552fb014043c69fcadc56863529 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" -DEFINE_bool(cudnn_deterministic, true, +DEFINE_bool(cudnn_deterministic, false, "Whether allow using an autotuning algorithm for convolution " "operator. The autotuning algorithm may be non-deterministic. If " - "false, the algorithm is deterministic."); + "true, the algorithm is deterministic."); namespace paddle { namespace operators { @@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - if (FLAGS_cudnn_deterministic) { + if (!FLAGS_cudnn_deterministic) { CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, @@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - if (FLAGS_cudnn_deterministic) { + if (!FLAGS_cudnn_deterministic) { CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 5098bd8700e11c9a2faeba90c38ed2d9499b17cf..0511eb42a073ac305634110a71a35e501f062132 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( const std::shared_ptr user_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(src_pd, user_pd, user_memory_p, @@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( const std::shared_ptr user_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, @@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( const std::shared_ptr user_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, @@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); auto user_pd = user_weights_memory_p->get_primitive_desc(); return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, @@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireSrcMemoryFromPrimitive( const std::shared_ptr user_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto src_pd = conv_pd_->src_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", @@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); return this->AcquireMemory(weights_pd, user_weights_pd, @@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial - static std::string GetHash(memory::dims& input_dims, - memory::dims& weights_dims, - std::vector& strides, - std::vector& paddings, - std::vector& dilations, int groups, - const std::string& suffix) { + static std::string GetHash(memory::dims& input_dims, // NOLINT + memory::dims& weights_dims, // NOLINT + std::vector& strides, // NOLINT + std::vector& paddings, // NOLINT + std::vector& dilations, // NOLINT + int groups, const std::string& suffix) { return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + suffix; diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc index 1a5427b39241b666eeaf12b173ea00443bb5f6e4..c86cd57316078778e5930c9b524b931d523028d7 100644 --- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc @@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); auto x_dims = x->dims(); - auto y_dims = y->dims(); + auto y_dims_untrimed = y->dims(); auto z_dims = z->dims(); // Execute default elementwise_add operator when // broadcast operations need to performed. - if (x_dims != y_dims) { + if (x_dims != y_dims_untrimed) { auto sum_func = [](T a, T b) -> T { return a + b; }; TransformFunctor { ctx.template device_context(), sum_func); - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; @@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { "Wrong layout/format set for Y tensor"); std::vector src_x_tz = framework::vectorize2int(x_dims); - std::vector src_y_tz = framework::vectorize2int(y_dims); + std::vector src_y_tz = framework::vectorize2int(y_dims_untrimed); std::vector dst_tz = framework::vectorize2int(z_dims); std::vector srcs_pd; @@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); + // skip out, x, y, + // dout length is larger or equal than dx, dy. + auto* out = dout; + auto *x = dout, *y = dout; auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { in->set_layout(DataLayout::kMKLDNN); in->set_format(out->format()); }; - if (x->dims() == y->dims()) { - auto blas = math::GetBlas(ctx); - if (dx) { - blas.VCOPY(dout->numel(), dout->data(), - dx->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dx, dout); - } - - if (dy) { - blas.VCOPY(dout->numel(), dout->data(), - dy->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dy, dout); + if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { + if (dx->dims() == dy->dims()) { + auto blas = math::GetBlas(ctx); + if (dx) { + blas.VCOPY(dout->numel(), dout->data(), + dx->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dx, dout); + } + + if (dy) { + blas.VCOPY(dout->numel(), dout->data(), + dy->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dy, dout); + } } } else { // Execute default kernel when broadcast is needed - ElemwiseGradCompute, IdentityGrad>( + ElemwiseExplicitGradCompute, IdentityGrad>( ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); } diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc index d2c20537136fc3ac9d1bece24a2238f26215c922..3c97ac995c649ecd0d196a584240e1e7ac04f08e 100644 --- a/paddle/fluid/operators/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y"); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", + "X"); REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h index baf04c30b17cb333fc8a6544afd6c479442f835b..5356105e2e551c0528694091608fc7585dce66d2 100644 --- a/paddle/fluid/operators/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); - ElemwiseGradCompute, IdentityGrad>( - ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), - IdentityGrad()); + ElemwiseExplicitGradCompute, + IdentityGrad>(ctx, *x, *y, *out, *dout, axis, + dx, dy, IdentityGrad(), + IdentityGrad()); } template @@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; - if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) { + if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr && + dy != nullptr && (dx->dims() == dy->dims())) { elementwise_add_grad(ctx, x, y, out, dout, dx, dy); } else { default_elementwise_add_grad(ctx, x, y, out, dout, dx, diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc index 824b1221e5a77c8799dc34820b7f0db180c2439e..84c8a65e5f859d276ae6d5f1a3f25c9d713a7a61 100644 --- a/paddle/fluid/operators/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise_div_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; + REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); + REGISTER_OP_CPU_KERNEL( elementwise_div, ops::ElementwiseDivKernel, diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index bb88970e42c194d9437609b62435f1a89e2b446b..d8a12e800ad733800c1ec333f15d31d4dcd1a3a5 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { void Make() final { AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - AddOutput("Out", "The output of elementwise op.").Reuse("X"); + // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save + // memory.").AsIntermediate(); + AddOutput("Out", "The output of elementwise op."); AddAttr("axis", "(int, default -1). The start dimension index " "for broadcasting Y onto X.") @@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$. )DOC", GetName(), GetEquation())); + SetReuse(); } protected: virtual std::string GetName() const = 0; virtual std::string GetEquation() const = 0; + virtual void SetReuse() {} }; class ElementwiseOpGrad : public framework::OperatorWithKernel { @@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = framework::ToDataType( + ctx.Input(framework::GradVarName("Out"))->type()); #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { @@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; + +// For Add, Sub op, the X, Out is not needed. +class ElementwiseOpExplicitGrad : public ElementwiseOpGrad { + public: + using operators::ElementwiseOpGrad::ElementwiseOpGrad; + using operators::ElementwiseOpGrad::GetExpectedKernelType; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(x_grad_name, out_dims); + } + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + } // namespace operators } // namespace paddle +/* +*/ + +#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \ + class kernel_type##GradMaker \ + : public paddle::framework::SingleGradOpDescMaker { \ + public: \ + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \ + \ + protected: \ + std::unique_ptr Apply() const override { \ + auto* op = new paddle::framework::OpDesc(); \ + op->SetType(#kernel_type "_grad"); \ + op->SetInput("Y", Input("Y")); \ + op->SetInput(::paddle::framework::GradVarName("Out"), \ + OutputGrad("Out")); \ + op->SetAttrMap(Attrs()); \ + op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \ + op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y")); \ + return std::unique_ptr<::paddle::framework::OpDesc>(op); \ + } \ + } + #define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \ class __ElemwiseOp##op_type##Maker__ \ : public ::paddle::operators::ElementwiseOpMaker { \ @@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ::paddle::operators::ElementwiseOpInferVarType, \ ::paddle::framework::DefaultGradOpDescMaker); \ REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad) + +#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \ + class __ElemwiseOp##op_type##Maker__ \ + : public ::paddle::operators::ElementwiseOpMaker { \ + protected: \ + virtual std::string GetName() const { return op_name; } \ + virtual std::string GetEquation() const { return equation; } \ + virtual void SetReuse() { Reuse(__VA_ARGS__); } \ + }; \ + REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ + __ElemwiseOp##op_type##Maker__, \ + ::paddle::operators::ElementwiseOpInferVarType, \ + op_type##GradMaker); \ + REGISTER_OPERATOR(op_type##_grad, \ + ::paddle::operators::ElementwiseOpExplicitGrad) diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 8b052611f80ddf874ca48c1c58e13346528a834e..eb8272e90c32c3a0be2c0ce1bc679571af876317 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } -inline void trim_trailing_singular_dims(framework::DDim* dims) { +inline framework::DDim trim_trailing_singular_dims( + const framework::DDim& dims) { // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims->size(); + auto actual_dims_size = dims.size(); for (; actual_dims_size != 0; --actual_dims_size) { - if ((*dims)[actual_dims_size - 1] != 1) break; + if (dims[actual_dims_size - 1] != 1) break; } - if (actual_dims_size != dims->size()) { - auto actual_dims = framework::vectorize(*dims); - actual_dims.resize(actual_dims_size); - *dims = framework::make_ddim(actual_dims); + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; } + framework::DDim actual_dims = framework::make_ddim(trim_dims); + return actual_dims; } template @@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, #endif +template +void ElemwiseGradComputeNoBroadcast( + const framework::ExecutionContext& ctx, const framework::DDim& x_dim, + const framework::DDim& y_dim, const framework::Tensor& x, + const framework::Tensor& y, const framework::Tensor& out, + const framework::Tensor& dout, int axis, framework::Tensor* dx, + framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + size_t N = static_cast(framework::product(x_dim)); + platform::ForRange for_range( + ctx.template device_context(), N); + for_range(ElemwiseGradNoBroadcast{ + x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); +} + +template +void ElemwiseGradComputeWithBroadcast( + const framework::ExecutionContext& ctx, const framework::DDim& x_dim, + const framework::DDim& y_dim_untrimed, const framework::Tensor& x, + const framework::Tensor& y, const framework::Tensor& out, + const framework::Tensor& dout, int axis, framework::Tensor* dx, + framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + if (post == 1) { + int h = pre; + int w = n; + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + ElemwiseGradBroadcast1CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + ElemwiseGradBroadcast1CPU( + x.data(), y.data(), out.data(), dout.data(), h, w, dx_op, + dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + ElemwiseGradBroadcast2CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), out.data(), dout.data(), pre, n, post, dx_op, + dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + ElemwiseGradBroadcast2CPU( + x.data(), y.data(), out.data(), dout.data(), pre, n, post, + dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } +} + template void ElemwiseGradCompute(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& y, @@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, const framework::Tensor& dout, int axis, framework::Tensor* dx, framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + const framework::DDim x_dim = x.dims(); + const framework::DDim y_dim = y.dims(); if (x.dims() == y.dims()) { - size_t N = static_cast(framework::product(x.dims())); - platform::ForRange for_range( - ctx.template device_context(), N); - for_range(ElemwiseGradNoBroadcast{ - x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); + ElemwiseGradComputeNoBroadcast( + ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { // Y is a scalar - auto x_dim = x.dims(); - auto y_dim = y.dims(); - - axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - trim_trailing_singular_dims(&y_dim); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; - - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - if (post == 1) { - int h = pre; - int w = n; - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ - ElemwiseGradBroadcast1CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - ElemwiseGradBroadcast1CPU( - x.data(), y.data(), out.data(), dout.data(), h, w, - dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ - ElemwiseGradBroadcast2CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - ElemwiseGradBroadcast2CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - post, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } + ElemwiseGradComputeWithBroadcast( + ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } +} + +// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. +// explicit gradient can cut off X, Y, Out from gradient op +// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse +// elementwise code. +template +void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx, + const framework::Tensor& x, + const framework::Tensor& y, + const framework::Tensor& out, + const framework::Tensor& dout, int axis, + framework::Tensor* dx, framework::Tensor* dy, + DX_OP dx_op, DY_OP dy_op) { + if (dy == nullptr) { + const framework::DDim dx_dims = dout.dims(); + auto dy_dims = dx_dims; + ElemwiseGradComputeNoBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { + if (dout.dims() == dy->dims()) { + const framework::DDim dx_dims = dout.dims(); + const framework::DDim dy_dims = dy->dims(); + ElemwiseGradComputeNoBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { // Y is a scalar + auto dx_dims = dout.dims(); + const framework::DDim dy_dims = dy->dims(); + ElemwiseGradComputeWithBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } } +// Deprecated template void ElementwiseGradCompute(const framework::ExecutionContext& ctx, @@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, } axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - trim_trailing_singular_dims(&y_dims); + trim_trailing_singular_dims(y_dims); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; @@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, x, y, z, ctx.template device_context(), func); auto x_dims = x->dims(); - auto y_dims = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + auto y_dims_untrimed = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), "Rank of first input must >= rank of second input."); - if (x_dims == y_dims) { + if (x_dims == y_dims_untrimed) { functor.Run(); return; } - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc index a7562b166b373ee2a8c9b6f379431d88d3e45fcb..b7224261e6a7ca82dff92a25f5fe8818c08e676d 100644 --- a/paddle/fluid/operators/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise_sub_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y"); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out", + "X"); + REGISTER_OP_CPU_KERNEL( elementwise_sub, ops::ElementwiseSubKernel, diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h index fe088b8203722a43b9aba7be3878b8f4ca68ba12..11c7e3fe628001f095836a788f2bcc7c4ee7ad4b 100644 --- a/paddle/fluid/operators/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElemwiseGradCompute, SubGradDY>( + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; + + ElemwiseExplicitGradCompute, SubGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } }; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 438b44b42aaf4c7e3ff05a5f7c52bbfd850e92c7..e14b148cc00f425e90b0b2256ab3462753a34f47 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,12 +19,17 @@ limitations under the License. */ #include // NOLINT #include +#include "gflags/gflags.h" + #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" +DEFINE_int32(listen_and_serv_profile_period, 0, + "the period of listen_and_serv to do profile"); + namespace paddle { namespace operators { @@ -122,7 +127,18 @@ void ListenAndServOp::RunSyncLoop( std::shared_ptr(nullptr)); rpc_service_->ResetBarrierCounter(); + + int32_t profile_step = 0; while (true) { + PADDLE_ENFORCE_LE(profile_step, FLAGS_listen_and_serv_profile_period, + "profile_step should not be larger then " + "FLAGS_listen_and_serv_profile_period"); + if (FLAGS_listen_and_serv_profile_period > 0) { + if (profile_step == 0) { + auto pf_state = paddle::platform::ProfilerState::kCPU; + paddle::platform::EnableProfiler(pf_state); + } + } // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. rpc_service_->SetCond(distributed::kRequestSend); @@ -164,6 +180,15 @@ void ListenAndServOp::RunSyncLoop( // reset received sparse vars to avoid reuse it in the next mini-batch dynamic_cast(request_send_handler_.get()) ->ResetSparseVarRecorder(); + if (FLAGS_listen_and_serv_profile_period > 0) { + if (profile_step == FLAGS_listen_and_serv_profile_period) { + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kTotal, "/dev/null"); + profile_step = 0; + } else { + profile_step++; + } + } } // while(true) } diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index bb55ce21b0599bcff4db138a46c9c700f6e52422..1472edbbf47e3e4d6b22c65349713904b13647d2 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" namespace paddle { namespace operators { @@ -35,61 +36,18 @@ class Im2ColFunctordims().size() == 5); - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - - int channels_col = im_channels * filter_height * filter_width; - - const T* im_data = im.data(); - T* col_data = col->data(); - // TODO(TJ): change me to template - // further optimaze: - // 1. padding != 1 - // 2. could also support stride_h != 1 if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && - dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) { - int col_matrix_width = output_width * output_height; - size_t copy_size = sizeof(T) * output_width; - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = im_data + oh * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_height * im_width; - for (int kh = 0; kh < filter_height; ++kh) { - for (int kw = 0; kw < filter_width; ++kw) { - std::memcpy(dst_data, src_data + kw, copy_size); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - } - } - return; - } - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? static_cast(0) - : im_data[im_idx]; - } + dilation[1] == 1) { + if (padding[0] == 0 && padding[1] == 0) { + im2col_sh1sw1dh1dw1ph0pw0(im, col); + return; + } else if (padding[0] == 1 && padding[1] == 1) { + im2col_sh1sw1dh1dw1ph1pw1(im, col); + return; } + // TODO(TJ): complete padding >=2 } + im2col_common(im, dilation, stride, padding, col); } }; diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..0d32bc5bd0d7f25479370959cabeb9b9c9e7e2d6 --- /dev/null +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -0,0 +1,252 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +/** + * The most common im2col algorithm. + * Support dilation, stride and padding. + */ +template +inline void im2col_common(const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? static_cast(0) + : im_data[im_idx]; + } + } + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 0 + */ +template +inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + const T* im_data = im.data(); + T* col_data = col->data(); + int col_matrix_width = output_width * output_height; + int im_size = im_height * im_width; + size_t copy_size = sizeof(T) * output_width; + const T* im_data_oh = im_data; + T* dst_data_oh = col_data; + for (int oh = 0; oh < output_height; ++oh) { + const T* src_data_ic = im_data_oh; + T* dst_data = dst_data_oh; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = src_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + src_data_ic = src_data_ic + im_size; + } + im_data_oh = im_data_oh + im_width; + dst_data_oh = dst_data_oh + output_width; + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 1 + * and filter_width == 1 have a special implementation + */ +template +inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + constexpr int plh = 1; + constexpr int prh = 1; + constexpr int plw = 1; + constexpr int prw = 1; + + const T* im_data = im.data(); + T* col_data = col->data(); + int im_size = im_height * im_width; + int col_matrix_width = output_width * output_height; + int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow + int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow + + // fill height padding + { + size_t copy_size = sizeof(T) * output_width; + T* col_start_l = col_data; + T* col_start_r = col_data + (filter_height - 1) * col_block_fh + + col_matrix_width - output_width; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_l = col_start_l; + T* dst_data_r = col_start_r; + for (int kw = 0; kw < filter_width; ++kw) { + std::memset(dst_data_l, 0, copy_size); + std::memset(dst_data_r, 0, copy_size); + dst_data_l = dst_data_l + col_matrix_width; + dst_data_r = dst_data_r + col_matrix_width; + } + col_start_l = col_start_l + col_block_ic; + col_start_r = col_start_r + col_block_ic; + } + } + + auto pad = static_cast(0); + if (filter_width == 1) { + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data = dst_data_kh; + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width - 1; + *dst_data = pad; + ++dst_data; + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + // fill core + size_t copy_size = sizeof(T) * (output_width - plw - prw); + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = + im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + col_matrix_width; + continue; + } + std::memcpy(dst_data + plw, src_data, copy_size); + dst_data = dst_data + col_matrix_width; + src_data = src_data + im_width; + } + } + } + return; + } + + // filter_width != 1 + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (T* dst_data : + {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index db61f68db3e492d98cfa43576fa1900bffc8674d..ae2c90b33a4298ada4fd01aa2a44ebdf10d036d4 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include +#include #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" template void testIm2col() { @@ -160,82 +162,111 @@ void testIm2col() { delete context; } -void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { - paddle::framework::Tensor input; - paddle::framework::Tensor output; - paddle::framework::Tensor ref_output; - std::vector padding({ph, pw}); - std::vector stride({1, 1}); // stride_y, stride_x - std::vector dilation({1, 1}); // dilation_y, dilation_x - int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; - int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; - float* input_ptr = - input.mutable_data({ic, ih, iw}, paddle::platform::CPUPlace()); - for (int i = 0; i < input.numel(); ++i) { - input_ptr[i] = static_cast(i + 1); - } - - paddle::platform::CPUPlace place; - paddle::platform::CPUDeviceContext context(place); - output.mutable_data({ic, fh, fw, output_height, output_width}, place); - ref_output.mutable_data({ic, fh, fw, output_height, output_width}, - place); - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, - paddle::platform::CPUDeviceContext, float> - im2col; - im2col(context, input, dilation, stride, padding, &output); - auto ref_im2col = [&]( - const paddle::framework::Tensor& im, const std::vector& dilation, - const std::vector& stride, const std::vector& padding, - paddle::framework::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; - - const float* im_data = im.data(); - float* col_data = col->data(); - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? 0.f - : im_data[im_idx]; - } - } - } - }; - - ref_im2col(input, dilation, stride, padding, &ref_output); - - float* out_cfo_ptr = output.data(); - float* out_ref_ptr = ref_output.data(); - for (int i = 0; i < output.numel(); ++i) { - EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); - } -} - TEST(math, im2col) { testIm2col(); - testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0, - /*pw*/ 0); - testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, - /*pw*/ 1); #ifdef PADDLE_WITH_CUDA testIm2col(); #endif } + +#define PREPARE_IM2COL_CPU \ + paddle::platform::CPUPlace place; \ + paddle::platform::CPUDeviceContext context(place); \ + paddle::framework::Tensor input; \ + paddle::framework::Tensor out; \ + paddle::framework::Tensor ref; \ + std::vector padding({ph, pw}); \ + std::vector stride({1, 1}); \ + std::vector dilation({1, 1}); \ + float* input_ptr = input.mutable_data({ic, ih, iw}, place); \ + for (int i = 0; i < input.numel(); ++i) { \ + input_ptr[i] = static_cast(i + 1); \ + } \ + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \ + int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ + out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + paddle::operators::math::Im2ColFunctor< \ + paddle::operators::math::ColFormat::kCFO, \ + paddle::platform::CPUDeviceContext, float> \ + im2col + +void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + + im2col(context, input, dilation, stride, padding, &out); + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + + float* ref_data = ref.data(); + float* out_data = out.data(); + for (int i = 0; i < out.numel(); ++i) { + EXPECT_EQ(out_data[i], ref_data[i]); + } +} + +void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + constexpr int repeat = 100; + auto GetCurrentMs = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; + }; + auto t1 = GetCurrentMs(); + for (int i = 0; i < repeat; ++i) { + im2col(context, input, dilation, stride, padding, &out); + } + auto t2 = GetCurrentMs(); + + for (int i = 0; i < repeat; ++i) { + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + } + auto t3 = GetCurrentMs(); + + LOG(INFO) << "before: " << (t3 - t2) / repeat + << ",after: " << (t2 - t1) / repeat + << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%"; +} + +TEST(math, im2col_cputest) { + // padding_h == padding_w + for (int p = 0; p < 4; ++p) { + // width == height + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p, + /*pw*/ p); + + // height != width + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + + // filter == 1 + testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + } + + // padding_h != padding_w + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 2); + + // benchmark + for (int p : {0, 1}) { + for (int k : {1, 3, 5}) { + LOG(INFO) << "padding == " << p << ", filter == " << k; + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k, + /*ph*/ p, /*pw*/ p); + } + } +} diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a9fd1869c9df5464db6fc87ac633cdba2d6dbe7f..a1dfe39c3a4f84f5e4aaa2306813a7decf0e49ea 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor). The output tensor of reshape operator."); AddAttr>( "shape", "(std::vector) Target shape of reshape operator."); - AddAttr("inplace", - "(default: false) Change the source tensor's shape without " - "memory copy. When Attr(inplace) is set true, the output " - "tensor shares memory with Input(X), otherwise, a new output " - "tensor is created, and its data are copied from Input(x).") - .SetDefault(false); AddComment(R"DOC( Reshape Operator. @@ -233,16 +227,9 @@ class ReshapeKernel { "sequence_reshape op."); } - bool inplace = ctx.Attr("inplace"); + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); out->Resize(out_dims); - if (!inplace) { - out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*in); - out->Resize(out_dims); - } } }; @@ -251,19 +238,11 @@ class ReshapeGradKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *d_out = ctx.Input(framework::GradVarName("Out")); auto *d_x = ctx.Output(framework::GradVarName("X")); + auto in_dims = d_x->dims(); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - bool inplace = ctx.Attr("inplace"); - - auto in_dims = d_x->dims(); - if (!inplace) { - framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); - ctx.device_context().Wait(); - d_x->Resize(in_dims); - } else { - d_x->ShareDataWith(*d_out); - d_x->Resize(in_dims); - } + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(in_dims); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 31a7458f637921c290fc71ac748143867b4aae19..fefc7125b4de7274589670d29be4511469d5064a 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ctx->GetInputDim(framework::GradVarName("Out")), "Input(Out) and its gradients should have a same shape."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: @@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { layout_ = framework::DataLayout::kMKLDNN; } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = framework::ToDataType( + ctx.Input(framework::GradVarName("Out"))->type()); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "float16 can only be used on GPU place"); @@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { } }; +class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("softmax_grad"); + + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SoftmaxOpGradMaker); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( softmax, ops::SoftmaxKernel, diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d263426e073d95ad6d584c7370baf596587a993d..c4af5a65fc5f81c1af7c1fdcca637ca37c940637 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel { const auto &ids_rows = ids_selected_rows->rows(); auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); + for (auto &out : outs) { + out->mutable_rows()->clear(); + } // get rows for outputs - for (auto &id : ids_rows) { - size_t shard_id = static_cast(id) % shard_num; - outs[shard_id]->mutable_rows()->push_back(id); + std::unordered_map id_to_index; + for (size_t i = 0; i < ids_rows.size(); ++i) { + id_to_index[ids_rows[i]] = i; + size_t shard_id = static_cast(ids_rows[i]) % shard_num; + outs[shard_id]->mutable_rows()->push_back(ids_rows[i]); } int64_t row_width = ids_dims[1]; @@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel { {static_cast(out->rows().size()), row_width}); T *output = out->mutable_value()->mutable_data(ddim, place); for (int64_t i = 0; i < ddim[0]; ++i) { - memcpy(output + i * row_width, ids + out->rows()[i] * row_width, + memcpy(output + i * row_width, + ids + id_to_index[out->rows()[i]] * row_width, row_width * sizeof(T)); } } diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index 4a47ba5ccad4de338844e60f6fcbd6b7c11e891b..ca5ca1caeb23f01c047feeccf9c276b2dcd1cb68 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -13,7 +13,6 @@ // limitations under the License. #include -#include #include #include @@ -25,13 +24,13 @@ using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; -#define CUDA_ATOMIC_KERNEL(op, T) \ - __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; \ - i += blockDim.x * gridDim.x) { \ - paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]); \ - } \ +template +__global__ void AddKernel(const T* data_a, T* data_b, size_t num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]); } +} template struct AddFunctor { @@ -39,80 +38,116 @@ struct AddFunctor { }; template -struct SubFunctor { - T operator()(const T& a, const T& b) { return a - b; } -}; - -// NOTE(dzhwinter): the float16 add has small underflow/overflow -// so we use EXPECT_NEAR to check the result. -#define ARITHMETIC_KERNEL_LAUNCH(op, T) \ - void Test##T##op(size_t num) { \ - T *in1, *in2, *out; \ - T *d_in1, *d_in2; \ - size_t size = sizeof(T) * num; \ - cudaMalloc(reinterpret_cast(&d_in1), size); \ - cudaMalloc(reinterpret_cast(&d_in2), size); \ - in1 = reinterpret_cast(malloc(size)); \ - in2 = reinterpret_cast(malloc(size)); \ - out = reinterpret_cast(malloc(size)); \ - std::minstd_rand engine; \ - std::uniform_real_distribution dist(0.0, 1.0); \ - for (size_t i = 0; i < num; ++i) { \ - in1[i] = static_cast(dist(engine)); \ - in2[i] = static_cast(dist(engine)); \ - } \ - cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ - cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ - op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); \ - cudaDeviceSynchronize(); \ - cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ - for (size_t i = 0; i < num; ++i) { \ - EXPECT_NEAR(static_cast(out[i]), \ - static_cast(op##Functor()(in1[i], in2[i])), \ - 0.001); \ - } \ - free(in1); \ - free(in2); \ - free(out); \ - cudaFree(d_in1); \ - cudaFree(d_in2); \ +void TestCase(size_t num) { + T *in1, *in2, *out; + T *d_in1, *d_in2; + size_t size = sizeof(T) * num; + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num; ++i) { + in1[i] = static_cast(dist(engine)); + in2[i] = static_cast(dist(engine)); } -CUDA_ATOMIC_KERNEL(Add, float); -CUDA_ATOMIC_KERNEL(Add, double); -CUDA_ATOMIC_KERNEL(Add, float16); - -ARITHMETIC_KERNEL_LAUNCH(Add, float); -ARITHMETIC_KERNEL_LAUNCH(Add, double); -ARITHMETIC_KERNEL_LAUNCH(Add, float16); - -namespace paddle { -namespace platform { -USE_CUDA_ATOMIC(Sub, int); -}; -}; -CUDA_ATOMIC_KERNEL(Sub, int); -ARITHMETIC_KERNEL_LAUNCH(Sub, int); + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(in1[i], in2[i])), 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} // cuda primitives TEST(CudaAtomic, Add) { - TestfloatAdd(static_cast(10)); - TestfloatAdd(static_cast(1024 * 1024)); - TestdoubleAdd(static_cast(10)); - TestdoubleAdd(static_cast(1024 * 1024)); -} + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); -TEST(CudaAtomic, Sub) { - TestintSub(static_cast(10)); - TestintSub(static_cast(1024 * 1024)); + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); } TEST(CudaAtomic, float16) { - using paddle::platform::float16; - Testfloat16Add(static_cast(1)); - Testfloat16Add(static_cast(2)); - Testfloat16Add(static_cast(3)); + TestCase(static_cast(1)); + TestCase(static_cast(2)); + TestCase(static_cast(3)); + + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); +} + +// unalignment of uint8 +void TestUnalign(size_t num, const int shift_bit) { + PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2"); + float16 *in1, *in2, *out; + float16 *d_in1, *d_in2; + size_t size = sizeof(uint8_t) * (num + shift_bit); + size_t array_size = sizeof(float16) * (num / 2); + + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + + // right shift 1, mimic the unalignment of address + float16* r_in1 = + reinterpret_cast(reinterpret_cast(in1) + shift_bit); + float16* r_in2 = + reinterpret_cast(reinterpret_cast(in2) + shift_bit); + + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num / 2; ++i) { + r_in1[i] = static_cast(dist(engine)); + r_in2[i] = static_cast(dist(engine)); + } + cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num / 2; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(r_in1[i], r_in2[i])), + 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} + +TEST(CudaAtomic, float16Unalign) { + // same with float16 testcase + TestUnalign(static_cast(2), /*shift_bit*/ 2); + TestUnalign(static_cast(1024), /*shift_bit*/ 2); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 2); + + // shift the address. + TestUnalign(static_cast(2), /*shift_bit*/ 1); + TestUnalign(static_cast(1024), /*shift_bit*/ 1); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 1); - Testfloat16Add(static_cast(10)); - Testfloat16Add(static_cast(1024 * 1024)); + TestUnalign(static_cast(2), /*shift_bit*/ 3); + TestUnalign(static_cast(1024), /*shift_bit*/ 3); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 3); } diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 94ce83975a7f13daa2b6a4d480cb22cc95811b9b..67ea64833d3b844d88a2e5996f860ef165bd8ffd 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) { // convert the value into float and do the add arithmetic. // then store the result into a uint32. -inline __device__ uint32_t add_to_low_half(uint32_t val, float x) { +inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { float16 low_half; // the float16 in lower 16bits - low_half.x = static_cast(val & 0xffffu); + low_half.x = static_cast(val & 0xFFFFu); low_half = static_cast(static_cast(low_half) + x); - return (val & 0xffff0000u) | low_half.x; + return (val & 0xFFFF0000u) | low_half.x; } -inline __device__ uint32_t add_to_high_half(uint32_t val, float x) { +inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { float16 high_half; // the float16 in higher 16bits high_half.x = static_cast(val >> 16); high_half = static_cast(static_cast(high_half) + x); - return (val & 0xffffu) | (static_cast(high_half.x) << 16); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); } CUDA_ATOMIC_WRAPPER(Add, float16) { // concrete packed float16 value may exsits in lower or higher 16bits // of the 32bits address. - uint32_t *address_as_ui = - reinterpret_cast(reinterpret_cast(address) - - (reinterpret_cast(address) & 2)); + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); float val_f = static_cast(val); uint32_t old = *address_as_ui; uint32_t sum; uint32_t newval; uint32_t assumed; - if (((size_t)address & 2) == 0) { + if (((uintptr_t)address & 0x02) == 0) { // the float16 value stay at lower 16 bits of the address. do { assumed = old; old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); } while (old != assumed); float16 ret; - ret.x = old & 0xffffu; + ret.x = old & 0xFFFFu; return ret; } else { // the float16 value stay at higher 16 bits of the address. diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0f334b2892d77b836728cf79898d1832e90e7c00..a8bc16f1b5b9b624e88e355d8ce4741fcec34bc3 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -534,7 +534,7 @@ EOF make -j `nproc` inference_lib_dist cd ${PADDLE_ROOT}/build cp -r fluid_install_dir fluid - tar -cf fluid.tgz fluid + tar -czf fluid.tgz fluid fi } diff --git a/python/paddle/batch.py b/python/paddle/batch.py index d48c54fcbb66487617b1946bc69724870c8f879c..008509660739d61245526278735064472b8b06dd 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False): if drop_last == False and len(b) != 0: yield b + # Batch size check + batch_size = int(batch_size) + if batch_size <= 0: + raise ValueError("batch_size should be a positive integeral value, " + "but got batch_size={}".format(batch_size)) + return batch_reader diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 447abfac59648efd51d5a2b0cf0aa9bcc5319f3f..7b61a1e6a27b8614d096a51af41a0bdcc3a0aaf3 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -127,6 +127,7 @@ def __bootstrap__(): ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') + read_env_flags.append('listen_and_serv_profile_period') if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1917624fd70ccaa7405b8bc9c859cd3cb9c900d4..9fae96d9bc7f5e6e89f8ee3894d28ae306e10ca9 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -38,6 +38,7 @@ __all__ = [ __auto__ = [ 'iou_similarity', 'box_coder', + 'polygon_box_transform', ] __all__ += __auto__ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b3c73a749eab89f7c7039c4843b6188402a70f1a..a82fdf41a653327e0cc5919b81fb37897a338107 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4474,15 +4474,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): "except one unknown dimension.") helper = LayerHelper("reshape", **locals()) - reshaped = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type="reshape", inputs=inputs, - attrs={"shape": shape, - "inplace": inplace}, - outputs={"Out": reshaped}) + attrs={"shape": shape}, + outputs={"Out": out}) - return helper.append_activation(reshaped) + return helper.append_activation(out) def lod_reset(x, y=None, target_lod=None): diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 60c1413d7c1f9efeb9adbc538e33d70cf5239f88..f70c7f2258ce588444cf46d6c8affc4c9555203e 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -66,9 +66,7 @@ __all__ = [ 'scatter', 'sum', 'slice', - 'polygon_box_transform', 'shape', - 'iou_similarity', 'maxout', ] + __activations__ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a9bd8930d032a98cbdce284af9f7d0ae1b78bfaf..eabe6bb901ad1b7afe71717eb3f5f5765f261201 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -122,7 +122,7 @@ class ParallelExecutor(object): else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - exec_strategy.num_threads = cpu_num + exec_strategy.num_threads = cpu_num * 2 if build_strategy is None: build_strategy = BuildStrategy() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f14f0506a905c311c7ea9eda49396bcb1369a35a..8e2a29811dad0176be0274d2e851acb36926c92f 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -62,5 +62,6 @@ if(WITH_DISTRIBUTE) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ee8020a73546cb9037e9dc4be589c62bb1b6b937 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -0,0 +1,280 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +import os +import sys +import transformer_model +import paddle.dataset.wmt16 as wmt16 + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + +WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" + + +class ModelHyperParams(object): + # Dictionary size for source and target language. This model directly uses + # paddle.dataset.wmt16 in which , and token has + # alreay been added, but the token is not added. Transformer requires + # sequences in a mini-batch are padded to have the same length. A token is + # added into the original dictionary in paddle.dateset.wmt16. + + # size of source word dictionary. + src_vocab_size = 10000 + # index for token in source language. + src_pad_idx = src_vocab_size + + # size of target word dictionay + trg_vocab_size = 10000 + # index for token in target language. + trg_pad_idx = trg_vocab_size + + # position value corresponding to the token. + pos_pad_idx = 0 + + # max length of sequences. It should plus 1 to include position + # padding token for position encoding. + max_length = 50 + + # the dimension for word embeddings, which is also the last dimension of + # the input and output of multi-head attention, position-wise feed-forward + # networks, encoder and decoder. + + d_model = 512 + # size of the hidden layer in position-wise feed-forward networks. + d_inner_hid = 1024 + # the dimension that keys are projected to for dot-product attention. + d_key = 64 + # the dimension that values are projected to for dot-product attention. + d_value = 64 + # number of head used in multi-head attention. + n_head = 8 + # number of sub-layers to be stacked in the encoder and decoder. + n_layer = 6 + # dropout rate used by all dropout layers. + dropout = 0.1 + + +def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. Then, convert the numpy + data to tensors and return a dict mapping names to tensors. + """ + + def __pad_batch_data(insts, + pad_idx, + is_target=False, + return_pos=True, + return_attn_bias=True, + return_max_len=True): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ + return_list = [] + max_len = max(len(inst) for inst in insts) + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if return_pos: + inst_pos = np.array([[ + pos_i + 1 if w_i != pad_idx else 0 + for pos_i, w_i in enumerate(inst) + ] for inst in inst_data]) + + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, + max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( + [-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + return return_list if len(return_list) > 1 else return_list[0] + + src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, is_target=False) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, is_target=True) + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, + False, False, False) + lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + + return [ + src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ] + + +def transformer(use_feed): + assert not use_feed, "transfomer doesn't support feed yet" + return transformer_model.transformer( + ModelHyperParams.src_vocab_size + 1, + ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) + + +def get_model(): + avg_cost = transformer(use_feed=False) + optimizer = fluid.optimizer.Adam() + optimizer.minimize(avg_cost) + return avg_cost + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +class DistTransformer2x2(object): + def run_pserver(self, pserver_endpoints, trainers, current_endpoint, + trainer_id): + get_model() + t = get_transpiler(trainer_id, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(pserver_prog) + + def _wait_ps_ready(self, pid): + retry_times = 20 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + print("waiting ps ready: ", pid) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): + avg_cost = get_model() + if is_dist: + t = get_transpiler(trainer_id, + fluid.default_main_program(), endpoints, + trainers) + trainer_prog = t.get_trainer_program() + else: + trainer_prog = fluid.default_main_program() + + startup_exe = fluid.Executor(place) + startup_exe.run(fluid.default_startup_program()) + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, loss_name=avg_cost.name, exec_strategy=strategy) + + first_loss, = exe.run(fetch_list=[avg_cost.name]) + print(first_loss) + for i in xrange(5): + _ = exe.run(fetch_list=[avg_cost.name]) + last_loss, = exe.run(fetch_list=[avg_cost.name]) + print(last_loss) + + +def main(role="pserver", + endpoints="127.0.0.1:9123", + trainer_id=0, + current_endpoint="127.0.0.1:9123", + trainers=1, + is_dist=True): + + reader = paddle.batch( + wmt16.train(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=transformer_model.batch_size) + + with fluid.recordio_writer.create_recordio_writer( + WMT16_RECORDIO_FILE) as writer: + for batch in reader(): + for tensor in prepare_batch_input( + batch, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): + t = fluid.LoDTensor() + t.set(tensor, fluid.CPUPlace()) + writer.append_tensor(t) + writer.complete_append_tensor() + + model = DistTransformer2x2() + if role == "pserver": + model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) + else: + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print( + "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + ) + role = sys.argv[1] + endpoints = sys.argv[2] + trainer_id = int(sys.argv[3]) + current_endpoint = sys.argv[4] + trainers = int(sys.argv[5]) + is_dist = True if sys.argv[6] == "TRUE" else False + main( + role=role, + endpoints=endpoints, + trainer_id=trainer_id, + current_endpoint=current_endpoint, + trainers=trainers, + is_dist=is_dist) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index d5bdc28c5d6080cd3e94935b76b2948db07e3c2f..b27d773f09d9a6daad5a10b65e683f4e11881de1 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -68,6 +68,10 @@ def get_numeric_gradient(place, tensor_to_check_dtype = np.float32 elif tensor_to_check_dtype == core.VarDesc.VarType.FP64: tensor_to_check_dtype = np.float64 + elif tensor_to_check_dtype == core.VarDesc.VarType.FP16: + tensor_to_check_dtype = np.float16 + # set delta as np.float16, will automatic convert to float32, float64 + delta = np.array(delta).astype(np.float16) else: raise ValueError("Not supported data type " + str( tensor_to_check_dtype)) @@ -75,13 +79,24 @@ def get_numeric_gradient(place, gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype) def __get_elem__(tensor, i): - if tensor_to_check_dtype == np.float32: + if tensor_to_check_dtype == np.float16: + numpy_tensor = np.array(tensor).astype(np.float16) + numpy_tensor = numpy_tensor.flatten() + return numpy_tensor[i] + elif tensor_to_check_dtype == np.float32: return tensor._get_float_element(i) else: return tensor._get_double_element(i) def __set_elem__(tensor, i, e): - if tensor_to_check_dtype == np.float32: + if tensor_to_check_dtype == np.float16: + numpy_tensor = np.array(tensor).astype(np.float16) + shape = numpy_tensor.shape + numpy_tensor = numpy_tensor.flatten() + numpy_tensor[i] = e + numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16) + tensor.set(numpy_tensor, place) + elif tensor_to_check_dtype == np.float32: tensor._set_float_element(i, e) else: tensor._set_double_element(i, e) @@ -135,6 +150,11 @@ class OpTest(unittest.TestCase): if not self.call_once: self.call_once = True self.dtype = data_type + # See the comment of np_dtype_to_fluid_dtype + # If the input type is uint16, we assume use float16 + # for lodtensor dtype. + if self.dtype == np.uint16: + self.dtype == np.float16 def infer_dtype_from_inputs_outputs(self, inputs, outputs): def infer_dtype(numpy_dict): @@ -163,19 +183,25 @@ class OpTest(unittest.TestCase): for name, np_value in self.inputs[var_name]: tensor = core.LoDTensor() if isinstance(np_value, tuple): - tensor.set(np_value[0], place) + tensor.set( + OpTest.np_value_to_fluid_value(np_value[0]), place) tensor.set_recursive_sequence_lengths(np_value[1]) else: - tensor.set(np_value, place) + tensor.set( + OpTest.np_value_to_fluid_value(np_value), place) feed_map[name] = tensor else: tensor = core.LoDTensor() if isinstance(self.inputs[var_name], tuple): - tensor.set(self.inputs[var_name][0], place) + tensor.set( + OpTest.np_value_to_fluid_value(self.inputs[var_name][ + 0]), place) tensor.set_recursive_sequence_lengths(self.inputs[var_name][ 1]) else: - tensor.set(self.inputs[var_name], place) + tensor.set( + OpTest.np_value_to_fluid_value(self.inputs[var_name]), + place) feed_map[var_name] = tensor return feed_map @@ -309,13 +335,22 @@ class OpTest(unittest.TestCase): np.allclose( actual_t, expect_t, atol=atol), "Output (" + out_name + ") has diff at " + str(place) + - str(actual_t) + "\n" + str(expect_t)) + "\nExpect " + str(expect_t) + "\n" + "But Got" + + str(actual_t)) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + str(place)) def _get_places(self): + if self.dtype == np.float16: + if core.is_compiled_with_cuda() and core.op_support_gpu( + self.op_type): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + return [place] + else: + return [] places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -346,9 +381,9 @@ class OpTest(unittest.TestCase): def err_msg(): offset = np.argmax(diff_mat > max_relative_error) return ("%s Variable %s max gradient diff %f over limit %f, " - "the first error element is %d, %f, %f") % ( - msg_prefix, name, max_diff, max_relative_error, - offset, a.flatten()[offset], b.flatten()[offset]) + "the first error element is %d, expected %f, but got %f" + ) % (msg_prefix, name, max_diff, max_relative_error, + offset, a.flatten()[offset], b.flatten()[offset]) self.assertLessEqual(max_diff, max_relative_error, err_msg()) @@ -437,6 +472,21 @@ class OpTest(unittest.TestCase): input.dtype = np.uint16 return input + @staticmethod + def fluid_dtype_to_np_dtype(self, dtype): + """ + See above, convert the dtype to normal type. + """ + if dtype == np.uint16: + dtype = np.float16 + return dtype + + @staticmethod + def np_value_to_fluid_value(input): + if input.dtype == np.float16: + input = input.view(np.uint16) + return input + def _get_gradient(self, input_to_check, place, @@ -459,7 +509,7 @@ class OpTest(unittest.TestCase): if isinstance(place, fluid.CUDAPlace(0)): use_cuda = True executor = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=loss.name, main_program=program) + use_cuda=use_cuda, loss_name=loss.name, main_program=prog) else: executor = Executor(place) return list( diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py new file mode 100644 index 0000000000000000000000000000000000000000..1aaab6f906ef6482bc515bb3c42d82431902e1d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -0,0 +1,137 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time + +import unittest +import os +import sys +import signal +import subprocess + + +class TestDistBase(unittest.TestCase): + def setUp(self): + self._trainers = 2 + self._pservers = 2 + self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" + self._python_interp = "python" + + def start_pserver(self, model_file): + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) + ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) + + ps0_proc = subprocess.Popen( + ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps1_proc = subprocess.Popen( + ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return ps0_proc, ps1_proc + + def _wait_ps_ready(self, pid): + retry_times = 50 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error as e: + sys.stderr.write('waiting for pserver: %s, left retry %d\n' % + (e, retry_times)) + retry_times -= 1 + + def check_with_place(self, model_file, delta=1e-3): + # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN + required_envs = { + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH"), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15" + } + # Run local to get a base line + env_local = {"CUDA_VISIBLE_DEVICES": "0"} + env_local.update(required_envs) + local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ + (self._python_interp, model_file, + "127.0.0.1:1234", "127.0.0.1:1234", 1) + local_proc = subprocess.Popen( + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env_local) + local_proc.wait() + out, err = local_proc.communicate() + local_ret = out + sys.stderr.write('local_loss: %s\n' % local_ret) + sys.stderr.write('local_stderr: %s\n' % err) + + # Run dist train to compare with local results + ps0, ps1 = self.start_pserver(model_file) + self._wait_ps_ready(ps0.pid) + self._wait_ps_ready(ps1.pid) + + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \ + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) + tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \ + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) + + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + env0.update(required_envs) + env1.update(required_envs) + FNULL = open(os.devnull, 'w') + + tr0_proc = subprocess.Popen( + tr0_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env0) + tr1_proc = subprocess.Popen( + tr1_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env1) + + tr0_proc.wait() + tr1_proc.wait() + out, err = tr0_proc.communicate() + sys.stderr.write('dist_stderr: %s\n' % err) + loss_data0 = out + sys.stderr.write('dist_loss: %s\n' % loss_data0) + lines = loss_data0.split("\n") + dist_first_loss = eval(lines[0].replace(" ", ","))[0] + dist_last_loss = eval(lines[1].replace(" ", ","))[0] + + local_lines = local_ret.split("\n") + local_first_loss = eval(local_lines[0])[0] + local_last_loss = eval(local_lines[1])[0] + + self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) + self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) + + # check tr0_out + # FIXME: ensure the server process is killed + # replace with ps0.terminate() + os.kill(ps0.pid, signal.SIGKILL) + os.kill(ps1.pid, signal.SIGKILL) + FNULL.close() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index 3b67b3f5ccd67f86f87f292d83a6039ff46260bd..04671d079731ce414561b0ede6bc2b195b07d82a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -11,127 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import numpy as np -import argparse -import time -import math - import unittest -import os -import sys -import signal -import subprocess - - -class TestDistSeResneXt2x2(unittest.TestCase): - def setUp(self): - self._trainers = 2 - self._pservers = 2 - self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" - self._python_interp = "python" - - def start_pserver(self): - ps0_ep, ps1_ep = self._ps_endpoints.split(",") - ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) - ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) - - ps0_proc = subprocess.Popen( - ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - ps1_proc = subprocess.Popen( - ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return ps0_proc, ps1_proc - - def _wait_ps_ready(self, pid): - retry_times = 20 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(3) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. - os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def test_with_place(self): - # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN - required_envs = { - "PATH": os.getenv("PATH"), - "PYTHONPATH": os.getenv("PYTHONPATH"), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), - "FLAGS_fraction_of_gpu_memory_to_use": "0.15" - } - # Run local to get a base line - env_local = {"CUDA_VISIBLE_DEVICES": "0"} - env_local.update(required_envs) - local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \ - (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1) - local_proc = subprocess.Popen( - local_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env_local) - local_proc.wait() - out, err = local_proc.communicate() - local_ret = out - sys.stderr.write('local_loss: %s\n' % local_ret) - sys.stderr.write('local_stderr: %s\n' % err) - - # Run dist train to compare with local results - ps0, ps1 = self.start_pserver() - self._wait_ps_ready(ps0.pid) - self._wait_ps_ready(ps1.pid) - - ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) - tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) - - env0 = {"CUDA_VISIBLE_DEVICES": "0"} - env1 = {"CUDA_VISIBLE_DEVICES": "1"} - env0.update(required_envs) - env1.update(required_envs) - FNULL = open(os.devnull, 'w') - - tr0_proc = subprocess.Popen( - tr0_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env0) - tr1_proc = subprocess.Popen( - tr1_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env1) - - tr0_proc.wait() - tr1_proc.wait() - out, err = tr0_proc.communicate() - sys.stderr.write('dist_stderr: %s\n' % err) - loss_data0 = out - sys.stderr.write('dist_loss: %s\n' % loss_data0) - lines = loss_data0.split("\n") - dist_first_loss = eval(lines[0].replace(" ", ","))[0] - dist_last_loss = eval(lines[1].replace(" ", ","))[0] - - local_lines = local_ret.split("\n") - local_first_loss = eval(local_lines[0])[0] - local_last_loss = eval(local_lines[1])[0] +from test_dist_base import TestDistBase - self.assertAlmostEqual(local_first_loss, dist_first_loss) - self.assertAlmostEqual(local_last_loss, dist_last_loss) - # check tr0_out - # FIXME: ensure the server process is killed - # replace with ps0.terminate() - os.kill(ps0.pid, signal.SIGKILL) - os.kill(ps1.pid, signal.SIGKILL) - FNULL.close() +class TestDistSeResneXt2x2(TestDistBase): + def test_se_resnext(self): + # TODO(paddle-dev): Is the delta too large? + self.check_with_place("dist_se_resnext.py", delta=0.2) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..68cd35d751dbce7eef9919dc8678fc0dd117757b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from test_dist_base import TestDistBase + + +class TestDistTransformer2x2(TestDistBase): + def test_transformer(self): + # TODO(paddle-dev): check if the delta is OK. + # Usually start around ~8000 and converge to ~5000 + self.check_with_place("dist_transformer.py", delta=400) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index acf652d3fb9743d69b7f7e248ff7a3ee83fc4c50..1854232194963bcbe302010320a30d85747eea96 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + 'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32") } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index d090960c84e47da68a0ebea4609dfc3ed76e114e..daa5da8d95129af0305b326832a557daeb4c5c9c 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -17,6 +17,8 @@ import numpy as np import math from op_test import OpTest +np.random.seed(100) + def find_latest_set(num): return 1 + int(math.floor(math.log(num, 2))) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index bbddf1a0a4057aa5a0822659f328745f4f0c58f7..8f2dac786d03334642714d221710a20833939c7a 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -465,6 +465,15 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_shape(self): + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + out = layers.shape(input, name="shape") + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index cfd6e63e12258a92447e68b4afbc7ead91b68cc1..67733807f8f8582f68dcfa3f361e13a631a29597 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase): print(str(result_program)) +class TestMemoryTranspiler2(unittest.TestCase): + def setUp(self): + program = Program() + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + fc = layers.fc(input=x, size=10, act=None) + reshape = layers.reshape(x=fc, shape=[-1, 2, 5]) + fc = layers.reshape(x=reshape, shape=[-1, 5, 2]) + y_predict = layers.fc(input=fc, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(cost) + opt = optimizer.SGD(learning_rate=0.001) + opt.minimize(avg_cost) + self.program = program + + def test_inplace_ops(self): + print("before optimization") + print(str(self.program)) + result_program = memory_optimize(self.program) + print("after optimization") + print(str(result_program)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 95fbe6b016001d2c05956190f82ca656b9d1b451..c5cbfcf92d4ae5bfeccbe3210d348ee36daab82c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -211,7 +211,8 @@ class TestMNIST(TestParallelExecutorBase): self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + # FIXME(zcd): close this test temporally. + # self._compare_reduce_and_allreduce(fc_with_batchnorm, True) self._compare_reduce_and_allreduce(fc_with_batchnorm, False) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index b6215fddb11bb6b3a76b5a6395e7254d21971c13..8203d5d1fce0950130ab71db40fb306f73c41bd4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -21,7 +21,7 @@ import paddle import paddle.dataset.wmt16 as wmt16 import os -WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio" +WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" class ModelHyperParams(object): @@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase): writer.append_tensor(t) writer.complete_append_tensor() - @unittest.skip("transformer is buggy in multi gpu") def test_main(self): self.check_network_convergence(transformer, use_cuda=True) - self.check_network_convergence(transformer, use_cuda=False) + self.check_network_convergence(transformer, use_cuda=False, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index f51b5a7e9907294a5b91c920a363830d8b9a7137..2f5558578ac2a002a83c2a7e027ec5a96d8b4414 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -25,7 +25,7 @@ class TestReshapeOp(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): @@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} def test_check_output(self): @@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py index e9f0a06a56b42952800411d548bb3fc1732e031e..ca7861309839d183e18c168403881a0b1b5bf309 100644 --- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py @@ -15,6 +15,8 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestSplitIdsOp(OpTest): @@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest): self.check_output() +class TestSpliteIds(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + return places + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + def check_with_place(self, place): + scope = core.Scope() + rows = [0, 5, 7, 4, 9] + height = 20 + row_numel = 2 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(rows) + x.set_height(height) + np_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + for j in range(row_numel): + np_array[i, j] = rows[i] + j + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + outs_name = ["out%d" % i for i in xrange(3)] + outs = [ + scope.var(var_name).get_selected_rows() for var_name in outs_name + ] + + # expected output selected rows + expected_out_rows = [[0, 9], [7, 4], [5]] + + op = Operator("split_ids", Ids="X", Out=outs_name) + + for _ in range(3): + op.run(scope, place) + + for i in range(len(outs)): + expected_rows = expected_out_rows[i] + self.assertEqual(outs[i].rows(), expected_rows) + for j in range(len(expected_rows)): + row = expected_rows[j] + self.assertAlmostEqual( + float(row), np.array(outs[i].get_tensor())[j, 0]) + self.assertAlmostEqual( + float(row + 1), np.array(outs[i].get_tensor())[j, 1]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index bfe186d187901e9ba09fa32514e759f7d1f3ce8e..c6e176ca31c57e623addd9594be81c0abdce489b 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -18,14 +18,6 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator -def as_lodtensor(np_array, lod, place): - tensor = core.LoDTensor() - tensor.set(np_value, place) - if lod is not None: - tensor.set_recursive_sequence_lengths(lod) - return tensor - - def create_op(scope, op_type, inputs, outputs, attrs): kwargs = dict() @@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs): def set_input(scope, op, inputs, place): + def np_value_to_fluid_value(input): + if input.dtype == np.float16: + input = input.view(np.uint16) + return input + def __set_input__(var_name, var): if isinstance(var, tuple) or isinstance(var, np.ndarray): tensor = scope.find_var(var_name).get_tensor() @@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place): tensor.set_recursive_sequence_lengths(var[1]) var = var[0] tensor._set_dims(var.shape) - tensor.set(var, place) + tensor.set(np_value_to_fluid_value(var), place) elif isinstance(var, float): scope.find_var(var_name).set_float(var) elif isinstance(var, int): @@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): if name not in np_list: assert var_proto.intermediate, "{} not found".format(name) else: + # inferece the dtype from numpy value. np_value = np_list[name] if isinstance(np_value, tuple): dtype = np_value[0].dtype @@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): if is_input: shape = list(np_value.shape) lod_level = 0 + # NOTE(dzhwinter): type hacking + # numpy float16 is binded to paddle::platform::float16 + # in tensor_py.h via the help of uint16 datatype. Because + # the internal memory representation of float16 is + # actually uint16_t in paddle. So we use np.uint16 in numpy for + # raw memory, it can pass through the pybind. So in the testcase, + # we feed data use data.view(uint16), but the dtype is float16 in fact. + # The data.view(uint16) means do not cast the data type, but process data as the uint16 + if dtype == np.uint16: + dtype = np.float16 return block.create_var( dtype=dtype, shape=shape, lod_level=lod_level, name=name) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 53f8cdfd00350eafa9dad82588f8307d02c64324..17ab875f6a555c99bbe480a54ea2df3b1f60de2d 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -404,7 +404,7 @@ def transformer( trg_pad_idx, pos_pad_idx, ): file_obj = fluid.layers.open_recordio_file( - filename='./wmt16.recordio', + filename='/tmp/wmt16.recordio', shapes=[ [batch_size * max_length, 1], [batch_size * max_length, 1], diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 8b5a48e7eda3a3ef13d0492b542edd625c8a11e3..4d6761436ec47379c40005b80e2e98de6e8ddc52 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -494,6 +494,7 @@ class DistributeTranspiler(object): pserver_index = self.pserver_endpoints.index(endpoint) table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) + optimize_blocks.append(table_opt_block) prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index bca0b77ad71a3f65dda15191e5f540bfc2e043d1..0b72ea323b72a1a6cfd0911416c4037243d06ff4 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig -RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz +RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && \ bash build_scripts/install_nccl2.sh && rm -r build_scripts