diff --git a/CMakeLists.txt b/CMakeLists.txt index 231224f9249848b6e4981a98e0538794bf5d3c08..bdd48565edeca051f54e8fe4eb51cd1dbd5e836a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,6 +200,14 @@ include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) +if(WITH_GPU) + include(cuda) + include(tensorrt) + include(external/anakin) +else() + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) +endif() + include(cudnn) # set cudnn libraries, must before configure include(cupti) include(configure) # add paddle env configuration @@ -228,14 +236,6 @@ set(EXTERNAL_LIBS ${PYTHON_LIBRARIES} ) -if(WITH_GPU) - include(cuda) - include(tensorrt) - include(external/anakin) -else() - set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) -endif() - if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 2c84061ff572de4687b4d496f8ded6deee8d1011..9eebea816cbfc91052c95ecf99ecc4b0bea4e4c2 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index fb3d8ef8d53436f387acc3069a0eb887e6f07c59..8b7d91f234594becdda805c089fac0bb4e4e8e44 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") set(ANAKIN_COMPILE_EXTRA_FLAGS + -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable -Wno-error=unused-variable -Wno-unused-variable -Wno-error=format-extra-args -Wno-format-extra-args -Wno-error=comment -Wno-comment @@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-reorder -Wno-error=cpp) -set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz") +set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz") # A helper function used in Anakin, currently, to use it, one need to recursively include # nearly all the header files. @@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}") message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") + execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}") execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") - execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz") + execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz") endif() if (WITH_ANAKIN) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 5f3bfa296546fcbc6a3410d7ae072ff74954bc74..3ef317bb7a1c25c5738342f34ae7994b0184a7de 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -263,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -306,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', ' paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) +paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 139411f3e0d945f9265d19a28487c05d06722d69..6440607dbe4666ff3ff91dc526465706b3b9c1f0 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,6 +7,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) +cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) else() diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 60382faffb8e53870658b2d1ff83abc4008cb4cf..1a9ce746ea840bc088d222cc4e9bc05159d64734 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -17,6 +17,8 @@ #include #include +using float16 = paddle::platform::float16; + namespace paddle { namespace framework { @@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() { RegisterType(retv, proto_type, #cc_type) // NOTE: Add your customize type here. - RegType(platform::float16, proto::VarType::FP16); + RegType(float16, proto::VarType::FP16); RegType(float, proto::VarType::FP32); RegType(double, proto::VarType::FP64); RegType(int, proto::VarType::INT32); diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..54c41c55ba63c0b2001cfcb6a9e94fbb0036d437 --- /dev/null +++ b/paddle/fluid/framework/data_type_test.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/data_type.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" + +TEST(DataType, float16) { + using paddle::framework::Tensor; + using paddle::platform::CPUPlace; + using paddle::platform::float16; + namespace f = paddle::framework; + f::proto::VarType::Type dtype = f::proto::VarType::FP16; + + Tensor tensor; + CPUPlace cpu; + tensor.mutable_data(cpu, f::ToTypeIndex(dtype)); + + // test fp16 tensor + EXPECT_EQ(tensor.type(), std::type_index(typeid(float16))); + + // test fp16 size + EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u); + + // test debug info + std::string type = "float16"; + EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); +} diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h new file mode 100644 index 0000000000000000000000000000000000000000..6e302a29233b96451df14b4685911be1cd87c1ab --- /dev/null +++ b/paddle/fluid/framework/details/exception_holder.h @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +class ExceptionHolder { + public: + void Catch(const platform::EnforceNotMet& exp) { + std::lock_guard lock(mu_); + exception_.reset(new platform::EnforceNotMet(exp)); + type_ = kEnforceNotMet; + } + + void Catch(const platform::EOFException& exp) { + std::lock_guard lock(mu_); + // EOFException will not cover up existing EnforceNotMet. + if (exception_.get() == nullptr) { + exception_.reset(new platform::EOFException(exp)); + type_ = kEOF; + } + } + + bool ExceptionCatched() const { + std::lock_guard lock(mu_); + return exception_.get() != nullptr; + } + + void Throw() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + break; + case kEnforceNotMet: { + auto e = *static_cast(exception_.get()); + throw e; + break; + } + case kEOF: { + auto e = *static_cast(exception_.get()); + throw e; + break; + } + default: + LOG(FATAL) << "Unknown exception."; + } + exception_.reset(); + type_ = kNone; + } + + void Clear() { + std::lock_guard lock(mu_); + exception_.reset(); + type_ = kNone; + } + + private: + enum ExceptionType { kNone, kEnforceNotMet, kEOF }; + ExceptionType type_{kNone}; + + std::unique_ptr exception_; + mutable std::mutex mu_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index eec405073377b2782d7636c08e6eb3a7bd41202d..e556c84b0219eba2b92c456c205e03947171626b 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Clean run context run_op_futures_.clear(); - exception_.reset(); + exception_holder_.Clear(); // Step 3. Execution while (!pending_vars.empty()) { @@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { - std::unique_lock l(exception_mu_); - if (exception_) { - l.unlock(); + if (exception_holder_.ExceptionCatched()) { for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - l.lock(); - std::exception *exp = exception_.get(); - if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else { - LOG(FATAL) << "Unknown exception."; - } + exception_holder_.Throw(); } else { continue; } @@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp( ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EOFException ex) { - std::lock_guard l(exception_mu_); - // EOFException will not cover up existing EnforceNotMet. - if (exception_.get() == nullptr) { - exception_.reset(new platform::EOFException(ex)); - } + exception_holder_.Catch(ex); } catch (platform::EnforceNotMet ex) { - std::lock_guard l(exception_mu_); - exception_.reset(new platform::EnforceNotMet(ex)); + exception_holder_.Catch(ex); } catch (...) { LOG(FATAL) << "Unknown exception catched"; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index b0aaf60701fcfdfac0d9f2e547e6fb5edf63156c..9135c1f5d435d5e2c60eb90c80803361aa31a3c4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,6 +24,7 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" @@ -59,8 +60,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; - std::mutex exception_mu_; - std::unique_ptr exception_; + ExceptionHolder exception_holder_; std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index b517442bb73f43bc1cb1d639b6c6cf004b28d4cf..a260dd3da2a7863c06e51aa4feafd824ea254139 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -116,8 +116,8 @@ TEST(GraphHelperTest, Basic) { for (size_t i = 0; i < sorted.size(); ++i) { node_map[sorted[i]->Name()] = i; } - ASSERT_EQ(node_map.at("op1"), 0); - ASSERT_EQ(node_map.at("op2"), 1); + ASSERT_EQ(node_map.at("op1"), 0UL); + ASSERT_EQ(node_map.at("op2"), 1UL); ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); } } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 73ef55756c330bdbc3be89c436967b2a88625a43..f9e6bdf3625bdced9d1a9195a979b0f46016d8bf 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -97,15 +97,15 @@ TEST(GraphTest, Basic) { std::vector nodes(g->Nodes().begin(), g->Nodes().end()); for (ir::Node *n : nodes) { if (n->Name() == "sum") { - ASSERT_EQ(n->inputs.size(), 3); - ASSERT_EQ(n->outputs.size(), 1); + ASSERT_EQ(n->inputs.size(), 3UL); + ASSERT_EQ(n->outputs.size(), 1UL); } else if (n->Name() == "test_a" || n->Name() == "test_b" || n->Name() == "test_c") { - ASSERT_EQ(n->inputs.size(), 0); - ASSERT_EQ(n->outputs.size(), 1); + ASSERT_EQ(n->inputs.size(), 0UL); + ASSERT_EQ(n->outputs.size(), 1UL); } else if (n->Name() == "test_out") { - ASSERT_EQ(n->inputs.size(), 1); - ASSERT_EQ(n->outputs.size(), 0); + ASSERT_EQ(n->inputs.size(), 1UL); + ASSERT_EQ(n->outputs.size(), 0UL); } } ASSERT_EQ(nodes.size(), 5); diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index db95861c510b52a5b52229541434e6437d3fb9f4..3e17a512ce154de88ac890f3b29f03385595d95c 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) { ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type[" "CUDNN]"); + + using CUDAPlace = paddle::platform::CUDAPlace; + OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), + "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + "type[CUDNN]"); } TEST(OpKernelType, Hash) { diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922..2288c7fe6609a765612b468d69ad35101b92b384 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( return OpProtoAndCheckerMaker::VariableBuilder{output}; } +void OpProtoAndCheckerMaker::Reuse(const std::string& name, + const std::string& reused_name) { + bool found = false; + proto::OpProto::Var* var; + + for (auto& var : proto_->inputs()) { + if (var.name() == reused_name) { + found = true; + break; + } + } + PADDLE_ENFORCE(found == true, + "Input/Output name: %s reused_name: %s, one of them is not " + "exists or not matched.", + name, reused_name); + + found = false; + for (int i = 0; i < proto_->outputs().size(); ++i) { + var = proto_->mutable_outputs()->Mutable(i); + if (var->name() == name) { + PADDLE_ENFORCE(!var->has_reuse(), + "Output(%s) has been set reused var of %s", name, + var->reuse()); + found = true; + var->set_reuse(reused_name); + break; + } + } + PADDLE_ENFORCE(found == true, + "Input/Output name: %s reused_name: %s, one of them is not " + "exists or not matched.", + name, reused_name); +} + void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { std::unordered_set names; auto checker = [&](const std::string& name) { diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 92f86bb5de520878d0a7b8d7214620580242c061..80970291c9c234f1306162f4ffa3c2528f88c35f 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker { VariableBuilder AddOutput(const std::string &name, const std::string &comment); + void Reuse(const std::string &name, const std::string &reused_name); + template TypedAttrChecker &AddAttr(const std::string &name, const std::string &comment, diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index 58f70cb39c0d96ed3b9ff35ea132ba75a37f5405..b71c7b646857e11f291748c4c7c2af92b6d53231 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) { } class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddOutput("XOut", "output of test op").Reuse("X"); + } +}; + +class TestInplaceProtoMaker2 + : public paddle::framework::OpProtoAndCheckerMaker { public: void Make() { AddInput("X", "input of test op"); @@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, InplaceOutput) { - paddle::framework::proto::OpProto op_proto; + paddle::framework::proto::OpProto op_proto, op_proto2; paddle::framework::OpAttrChecker op_checker; TestInplaceProtoMaker proto_maker; - ASSERT_THROW(proto_maker(&op_proto, &op_checker), + TestInplaceProtoMaker2 proto_maker2; + + proto_maker(&op_proto, &op_checker); + + ASSERT_THROW(proto_maker2(&op_proto2, &op_checker), paddle::platform::EnforceNotMet); - // proto_maker(&op_proto, &op_checker); - // proto_maker.Make(); - // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); } + +// normal reuse +class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddInput("Y", "input of test op"); + AddOutput("Out", "output of test op"); + AddOutput("XOut", "output of test op"); + // avoid destructor exception. + // Validate(); + TestReuse(); + } + + virtual void TestReuse() {} +}; + +// test duplicate reuse error +class TestReuseProtoMaker2 : public TestReuseProtoMaker { + public: + void TestReuse() { + Reuse("Out", "X"); + Reuse("Out", "Y"); + } +}; + +// NotExists Input +class TestReuseProtoMaker3 : public TestReuseProtoMaker { + public: + void TestReuse() { + Reuse("Out", "NotExists"); + Reuse("XOut", "X"); + } +}; + +// NotExists Output +class TestReuseProtoMaker4 : public TestReuseProtoMaker { + public: + void TestReuse() { Reuse("NotExists", "X"); } +}; + +TEST(ProtoMaker, Reuse) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestReuseProtoMaker proto_maker; + proto_maker(&op_proto, &op_checker); +} + +// NOTE(dzhwinter): +// There is a Fatal CHECK on base class destructor, which will call abort inside +// instead of +// throw an exception. If we throw an exception in Make(), we will trigger the +// CHECK and terminate the tests. +// +// I had tried to replace the default CHECK with a exception, however, it's +// still not supported by glog. +// the details: +// https://github.com/google/glog/issues/249 +// https://github.com/facebookresearch/TensorComprehensions/issues/351 +/* +TEST(ProtoMaker, ReuseWithException) { + paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4; + paddle::framework::OpAttrChecker op_checker; + TestReuseProtoMaker2 proto_maker2; + TestReuseProtoMaker3 proto_maker3; + TestReuseProtoMaker4 proto_maker4; + EXPECT_THROW(proto_maker2(&op_proto2, &op_checker), + paddle::platform::EnforceNotMet); + + EXPECT_THROW(proto_maker3(&op_proto3, &op_checker), + paddle::platform::EnforceNotMet); + + EXPECT_THROW(proto_maker4(&op_proto4, &op_checker), + paddle::platform::EnforceNotMet); +} + +void FailureFunction() { + throw std::runtime_error("Check failed in destructor."); + // return 0; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + google::InstallFailureFunction(&FailureFunction); + return RUN_ALL_TESTS(); +} +*/ diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7c1c29fd9a81c558f7fd05abf52cd0a6dd522190..cdac00739bc48648b41751e644a953d0d310ffbf 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" @@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name, } if (var->IsType()) { - return var->Get().dims(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return DDim({-1}); + } + return tensor.dims(); } else if (var->IsType()) { if (get_actual_dim) { return var->Get().value().dims(); @@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name, } } +static std::string GetDtype(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return ""; + } + + if (var->IsType()) { + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return ""; + } + return DataTypeToString(ToDataType(tensor.type())); + } else if (var->IsType()) { + return DataTypeToString( + ToDataType(var->Get().value().type())); + } else { + return ""; + } +} + static int GetRowSize(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -91,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } if (var->IsType()) { - return var->Get().lod(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return default_lod; + } + return tensor.lod(); } else { return default_lod; } @@ -172,6 +201,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { if (row_size >= 0) { ss << "[row_size=" << row_size << "]"; } + std::string dtype = GetDtype(*scope, input.second[i]); + ss << ":" << dtype; ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index ef224d68f1fc561f45e9d7a81425e62655457648..0bbfd66148e9bc9080654bf1b0b34477115a0e6b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -82,7 +82,7 @@ class Tensor { template const T* data() const; - bool IsInitialized() const; + inline bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 0a1cb6d5703dace5e6be73285655ecd9d2ad89fb..cb2061c06a429d8e8116001a4aa4e8c46ea13428 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/tensor.h" #include #include +#include "paddle/fluid/platform/float16.h" namespace framework = paddle::framework; namespace platform = paddle::platform; @@ -213,3 +214,17 @@ TEST(Tensor, Layout) { src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } + +TEST(Tensor, FP16) { + using platform::float16; + framework::Tensor src; + float16* src_ptr = src.mutable_data({2, 3}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16)); + // EXPECT a human readable error message + // src.data(); + // Tensor holds the wrong type, it holds N6paddle8platform7float16E at + // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43] +} diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 67d355d10d3c9e11b59c9ce9d208826523095459..27fe575cb6167a726ff92a8f3d2e47b6f536ba39 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph tensorrt_subgraph_node_mark_pass.cc analyzer.cc helper.cc + model_store_pass.cc DEPS framework_proto proto_desc) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) +cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) @@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) +inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index b3a1075e5adf4a24bf32017574c061f36c46ba8c..98bdfcc00b9f0e8f40dfc92e4021b2bd6fb19313 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" @@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, DEFINE_string(inference_analysis_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +DEFINE_string(inference_analysis_output_storage_path, "", + "optimized model output path"); + namespace inference { namespace analysis { @@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager { AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); } AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + if (!FLAGS_inference_analysis_output_storage_path.empty()) { + AddPass("model-store-pass", new ModelStorePass); + } } std::string repr() const override { return "dfg-pass-manager"; } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 0132bf5b9c6552391aaa19542669487f42b685a7..c82fdfff86c91b4e07e3c1b80987d3d8d796ad23 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -16,28 +16,23 @@ limitations under the License. */ /* * This file contains Analyzer, an class that exposed as a library that analyze - * and optimize - * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to - * control whether - * an process is applied on the program. + * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has + * multiple flags to + * control whether an process is applied on the program. * * The processes are called Passes in analysis, the Passes are placed in a - * pipeline, the first - * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to - * a data flow - * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow - * graph to a - * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes - * which take a - * node or data flow graph as input. + * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a + * Fluid ProgramDesc to + * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms + * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the + * pipeline can be any Passes + * which take a node or data flow graph as input. * * The Analyzer can be used in two methods, the first is a executable file which - * can be used to - * pre-process the inference model and can be controlled by passing difference - * command flags; + * can be used to pre-process the inference model and can be controlled by + * passing difference command flags; * the other way is to compose inside the inference API as a runtime pre-process - * phase in the - * inference service. + * phase in the inference service. */ #include @@ -50,6 +45,7 @@ namespace paddle { // flag if not available. DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); DECLARE_string(inference_analysis_graphviz_log_root); +DECLARE_string(inference_analysis_output_storage_path); namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e1fe3eb797cdced56a61aa2db0c3d18601824f8 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_main.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file implements analysizer -- an executation help to analyze and + * optimize trained model. + */ +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + using paddle::inference::analysis::Analyzer; + using paddle::inference::analysis::Argument; + + Argument argument; + Analyzer analyzer; + analyzer.Run(&argument); + + return 0; +} diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 25a440e7e71fddb38cc515f99d15231675a8172e..24bfb3993cf569561980006b6627b56327dd0f67 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -20,14 +20,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, analysis_without_tensorrt) { +TEST(Analyzer, analysis_without_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } -TEST_F(DFG_Tester, analysis_with_tensorrt) { +TEST(Analyzer, analysis_with_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 6d316f20bff7a68754b0afec6463bd5d7579227f..a17d6281a2976f0600c7ce94c2d43e65d30de265 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,6 +23,7 @@ #pragma once +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" @@ -36,6 +37,16 @@ namespace analysis { * All the fields should be registered here for clearness. */ struct Argument { + Argument() = default; + explicit Argument(const std::string& fluid_model_dir) + : fluid_model_dir(new std::string(fluid_model_dir)) {} + // The directory of the trained model. + std::unique_ptr fluid_model_dir; + // The path of `__model__` and `param`, this is used when the file name of + // model and param is changed. + std::unique_ptr fluid_model_program_path; + std::unique_ptr fluid_model_param_path; + // The graph that process by the Passes or PassManagers. std::unique_ptr main_dfg; @@ -44,6 +55,9 @@ struct Argument { // The processed program desc. std::unique_ptr transformed_program_desc; + + // The output storage path of ModelStorePass. + std::unique_ptr model_output_store_path; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index bc1875f4d851c5d28d290357d94528fe3303f631..16aeae4d35e7bd54646053190da7f47eaca69aa0 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -176,7 +176,7 @@ struct GraphTraits { // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph); +ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 7912f8d7f17ae3c79e8f73f36b7095fd52c9ac86..a881262665f156812da9e1576aa29b05fc398499 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -20,7 +20,7 @@ namespace inference { namespace analysis { TEST(DataFlowGraph, BFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); @@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) { } TEST(DataFlowGraph, DFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); GraphTraits trait(&dfg); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index d8fc5e580a98f76233f01fdc4d7987311f78ee45..4ef381db295b986b91173a728b6d98640f6f4f51 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -26,21 +26,21 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Test) { - DataFlowGraph graph; +TEST(DataFlowGraph, Test) { + Argument argument(FLAGS_inference_model_dir); FluidToDataFlowGraphPass pass0; DataFlowGraphToFluidPass pass1; ASSERT_TRUE(pass0.Initialize(&argument)); ASSERT_TRUE(pass1.Initialize(&argument)); - pass0.Run(&graph); - pass1.Run(&graph); + pass0.Run(argument.main_dfg.get()); + pass1.Run(argument.main_dfg.get()); pass0.Finalize(); pass1.Finalize(); - LOG(INFO) << graph.nodes.size(); + LOG(INFO) << argument.main_dfg->nodes.size(); } }; // namespace analysis diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 65842b1e850953e77e3d4d28416609be271af9f1..928be7917047382d9b86294f6039b26b0ebf6f49 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -23,12 +23,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { - auto dfg = ProgramDescToDFG(*argument.origin_program_desc); +TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) { + Argument argument(FLAGS_inference_model_dir); + FluidToDataFlowGraphPass pass0; + ASSERT_TRUE(pass0.Initialize(&argument)); + pass0.Run(argument.main_dfg.get()); + + // auto dfg = ProgramDescToDFG(*argument.origin_program_desc); + DFG_GraphvizDrawPass::Config config("./", "test"); DFG_GraphvizDrawPass pass(config); pass.Initialize(&argument); - pass.Run(&dfg); + pass.Run(argument.main_dfg.get()); // test content std::ifstream file("./0-graph_test.dot"); diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 88fdf8c9cb4ce5369d70d416bbcfe6a4c7f23a98..511631d3e067f14bc1230d9e4b4d92dbe604e1d4 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -25,8 +26,20 @@ namespace analysis { bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); - PADDLE_ENFORCE(argument); + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set, might " + "duplicate called"; + } + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + if (!argument->main_dfg) { argument->main_dfg.reset(new DataFlowGraph); } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index dadb84059d21adab44159a6145b345460663cb96..d218dcd05015aa4636c16569de4addf4936c8cd5 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -21,8 +21,9 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Init) { +TEST(FluidToDataFlowGraphPass, Test) { FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); pass.Initialize(&argument); pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index f1064cd20f28092d80d3fd23a862da080b6cc2f3..a0f912b251d5ea29594a7f601d5b2bce91201790 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include @@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) { } } +static framework::proto::ProgramDesc LoadProgramDesc( + const std::string &model_path) { + std::ifstream fin(model_path, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(buffer); + return program_desc; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c429176424bd5c1d8fa5e015c19d698f966880e --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void ModelStorePass::Run(DataFlowGraph *x) { + if (!argument_->fluid_model_param_path) { + PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir); + argument_->fluid_model_param_path.reset( + new std::string(*argument_->fluid_model_dir + "param")); + } + PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path); + // Directly copy param file to destination. + std::stringstream ss; + // NOTE these commands only works on linux. + ss << "mkdir -p " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + ss.str(""); + + ss << "cp " << *argument_->fluid_model_dir << "/*" + << " " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + + // Store program + PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, + "program desc is not transformed, should call " + "DataFlowGraphToFluidPass first."); + const std::string program_output_path = + *argument_->model_output_store_path + "/__model__"; + std::ofstream file(program_output_path, std::ios::binary); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.", + program_output_path); + const std::string serialized_message = + argument_->transformed_program_desc->SerializeAsString(); + file.write(serialized_message.c_str(), serialized_message.size()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fac7083925776b6209d49255c9e67b930cb1250b --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.h @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines ModelStorePass, which store the runtime DFG to a Paddle + * model in the disk, and that model can be reloaded for prediction. + */ + +#pragma once +#include +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class ModelStorePass : public DataFlowGraphPass { + public: + bool Initialize(Argument* argument) override { + if (!argument) { + LOG(ERROR) << "invalid argument"; + return false; + } + argument_ = argument; + return true; + } + + void Run(DataFlowGraph* x) override; + + std::string repr() const override { return "DFG-store-pass"; } + std::string description() const override { + return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle + model in the disk, and that model can be reloaded for prediction again.)DD"; + } + + private: + Argument* argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f3526dd504e77e58d79b4f675db86a22fd0f26b --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(inference_model_dir, "", "Model path"); + +TEST(DFG_StorePass, test) { + Analyzer analyzer; + Argument argument(FLAGS_inference_model_dir); + argument.model_output_store_path.reset( + new std::string("./_dfg_store_pass_tmp")); + // disable storage in alalyzer + FLAGS_inference_analysis_output_storage_path = ""; + analyzer.Run(&argument); + + ModelStorePass pass; + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142..6806f9ff7dada2c1e2328e1ffbfd225afefcf474 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -50,6 +50,7 @@ class Pass { // Create a debugger Pass that draw the DFG by graphviz toolkit. virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } + virtual void Run() { LOG(FATAL) << "not valid"; } // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } // Run on a single Function. diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index dac1c509d728114bd24a2ea1150c407646026fd4..13423e4837e12a96e7a5dfc9ca3f59bf8b14746a 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -56,7 +56,7 @@ class TestNodePass final : public NodePass { std::string description() const override { return "some doc"; } }; -TEST_F(DFG_Tester, DFG_pass_manager) { +TEST(PassManager, DFG_pass_manager) { TestDfgPassManager manager; DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); @@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) { manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + Argument argument(FLAGS_inference_model_dir); + ASSERT_TRUE(&argument); ASSERT_TRUE(manager.Initialize(&argument)); manager.RunAll(); } -TEST_F(DFG_Tester, Node_pass_manager) { +TEST(PassManager, Node_pass_manager) { + Argument argument(FLAGS_inference_model_dir); // Pre-process: initialize the DFG with the ProgramDesc first. FluidToDataFlowGraphPass pass0; pass0.Initialize(&argument); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 67dd4da54b95add703428e1fded61065f60353e8..39cc433b40fad17f4f12359d4e907a250a88bd63 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { return false; }; -TEST_F(DFG_Tester, Split) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Split) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); LOG(INFO) << "spliter\n" << dfg.DotString(); @@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) { ASSERT_EQ(subgraphs.back().size(), 6UL); } -TEST_F(DFG_Tester, Fuse) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Fuse) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); size_t count0 = dfg.nodes.size(); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc index a6c15e848b99ca318f4583e3d4b88345fe8e5ebc..c1d932878e559180af987594535959afdf475587 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -22,11 +22,11 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { +TEST(TensorRTSubgraphNodeMarkPass, test) { // init FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); ASSERT_TRUE(pass.Initialize(&argument)); - argument.main_dfg.reset(new DataFlowGraph); pass.Run(argument.main_dfg.get()); TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { @@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { for (auto& node : argument.main_dfg->nodes.nodes()) { counter += node->attr(ATTR_supported_by_tensorrt).Bool(); } - + ASSERT_EQ(counter, 2); LOG(INFO) << counter << " nodes marked"; } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 1d749d3fa3f39b351ccee6ebeb82467f7220a0b6..67a5af83d89b771536ea11be51b35244ff5c09d6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -25,7 +25,7 @@ namespace analysis { DEFINE_string(dot_dir, "./", ""); -TEST_F(DFG_Tester, tensorrt_single_pass) { +TEST(TensorRTSubGraphPass, main) { std::unordered_set teller_set( {"elementwise_add", "mul", "sigmoid"}); SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { @@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { return false; }; - LOG(INFO) << "init"; + Argument argument(FLAGS_inference_model_dir); + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; @@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { FluidToDataFlowGraphPass pass0; TensorRTSubGraphPass trt_pass(std::move(teller)); - LOG(INFO) << "Initialize"; dfg_pass.Initialize(&argument); dfg_pass1.Initialize(&argument); pass0.Initialize(&argument); trt_pass.Initialize(&argument); - LOG(INFO) << "Run"; argument.main_dfg.reset(new DataFlowGraph); pass0.Run(argument.main_dfg.get()); dfg_pass.Run(argument.main_dfg.get()); diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index ce1191a567a4198f003520c40bf02487c48c56eb..1073a6f686eaeeaaae2d93ab044149b7df518085 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -32,27 +32,12 @@ namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); -static framework::proto::ProgramDesc LoadProgramDesc( - const std::string& model_dir = FLAGS_inference_model_dir) { - std::string msg; - std::string net_file = FLAGS_inference_model_dir + "/__model__"; - std::ifstream fin(net_file, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", net_file); - fin.seekg(0, std::ios::end); - msg.resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(msg.at(0)), msg.size()); - fin.close(); - framework::proto::ProgramDesc program_desc; - program_desc.ParseFromString(msg); - return program_desc; -} - static DataFlowGraph ProgramDescToDFG( const framework::proto::ProgramDesc& desc) { DataFlowGraph graph; FluidToDataFlowGraphPass pass; Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); pass.Initialize(&argument); pass.Run(&graph); @@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG( class DFG_Tester : public ::testing::Test { protected: void SetUp() override { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 3e60a61793339990648737c3d549d46cc5f5a887..259d79bedbf664f52b1189ca71567665a6d91180 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -19,6 +19,7 @@ endif(APPLE) set(inference_deps paddle_inference_api paddle_fluid_api) + if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) endif() @@ -63,6 +64,8 @@ endif() if (WITH_ANAKIN) # only needed in CI # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to + # compile the libinference_anakin_api.a and compile with anakin.so. + fetch_include_recursively(${ANAKIN_INCLUDE}) # compile the libinference_anakin_api.a and anakin.so. nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) @@ -73,7 +76,7 @@ if (WITH_ANAKIN) # only needed in CI if (WITH_TESTING) cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api) + DEPS inference_anakin_api_shared) target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endif(WITH_TESTING) endif() diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index 0206ac60103759deda91be741617bde63e003de6..6b374ceefbc180a5c22abe591f12e1c3d89bc64a 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -18,26 +18,36 @@ namespace paddle { -PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( +template +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( const AnakinConfig &config) { CHECK(Init(config)); } -bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { +template +bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { if (!(graph_.load(config.model_file))) { + LOG(FATAL) << "fail to load graph from " << config.model_file; return false; } - graph_.ResetBatchSize("input_0", config.max_batch_size); + auto inputs = graph_.get_ins(); + for (auto &input_str : inputs) { + graph_.ResetBatchSize(input_str, config.max_batch_size); + } // optimization for graph if (!(graph_.Optimize())) { return false; } // construct executer - executor_.init(graph_); + if (executor_p_ == nullptr) { + executor_p_ = new anakin::Net(graph_, true); + } return true; } -bool PaddleInferenceAnakinPredictor::Run( +template +bool PaddleInferenceAnakinPredictor::Run( const std::vector &inputs, std::vector *output_data, int batch_size) { for (const auto &input : inputs) { @@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run( << "'s type is not float"; return false; } - auto d_tensor_in_p = executor_.get_in(input.name); + auto d_tensor_in_p = executor_p_->get_in(input.name); + auto net_shape = d_tensor_in_p->valid_shape(); + if (net_shape.size() != input.shape.size()) { + LOG(ERROR) << " input " << input.name + << "'s shape size should be equal to that of net"; + return false; + } + int sum = 1; + for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; }); + if (sum > net_shape.count()) { + graph_.Reshape(input.name, input.shape); + delete executor_p_; + executor_p_ = new anakin::Net(graph_, true); + d_tensor_in_p = executor_p_->get_in(input.name); + } + + anakin::saber::Shape tmp_shape; + for (auto s : input.shape) { + tmp_shape.push_back(s); + } + d_tensor_in_p->reshape(tmp_shape); + float *d_data_p = d_tensor_in_p->mutable_data(); if (cudaMemcpy(d_data_p, static_cast(input.data.data()), d_tensor_in_p->valid_size() * sizeof(float), @@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run( } cudaStreamSynchronize(NULL); } - - executor_.prediction(); + cudaDeviceSynchronize(); + executor_p_->prediction(); + cudaDeviceSynchronize(); if (output_data->empty()) { LOG(ERROR) << "At least one output should be set with tensors' names."; return false; } for (auto &output : *output_data) { - auto *tensor = executor_.get_out(output.name); - output.shape = tensor->shape(); + auto *tensor = executor_p_->get_out(output.name); + output.shape = tensor->valid_shape(); if (output.data.length() < tensor->valid_size() * sizeof(float)) { output.data.Resize(tensor->valid_size() * sizeof(float)); } @@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run( return true; } -anakin::Net - &PaddleInferenceAnakinPredictor::get_executer() { - return executor_; +template +anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return *executor_p_; } // the cloned new Predictor of anakin share the same net weights from original // Predictor -std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { +template +std::unique_ptr +PaddleInferenceAnakinPredictor::Clone() { VLOG(3) << "Anakin Predictor::clone"; - std::unique_ptr cls(new PaddleInferenceAnakinPredictor()); + std::unique_ptr cls( + new PaddleInferenceAnakinPredictor()); // construct executer from other graph auto anakin_predictor_p = - dynamic_cast(cls.get()); + dynamic_cast *>(cls.get()); if (!anakin_predictor_p) { LOG(ERROR) << "fail to call Init"; return nullptr; @@ -103,14 +140,28 @@ std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { return std::move(cls); } +template class PaddleInferenceAnakinPredictor; +template class PaddleInferenceAnakinPredictor; + // A factory to help create difference predictor. template <> std::unique_ptr CreatePaddlePredictor< AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { VLOG(3) << "Anakin Predictor create."; - std::unique_ptr x( - new PaddleInferenceAnakinPredictor(config)); - return x; -} + if (config.target_type == AnakinConfig::NVGPU) { + VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; + } else if (config.target_type == AnakinConfig::X86) { + VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; + } else { + VLOG(3) << "Anakin Predictor create on unknown platform."; + return nullptr; + } +}; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index def096c867ec85624f5b221782ef8b6240923c05..836badd9799228c6c294dcad5df73d039d36a1ff 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -20,14 +20,16 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/inference/api/paddle_inference_api.h" -// from anakin #include "framework/core/net/net.h" +#include "framework/graph/graph.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "saber/core/shape.h" #include "saber/saber_types.h" namespace paddle { +template class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: PaddleInferenceAnakinPredictor() {} @@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { std::unique_ptr Clone() override; - anakin::Net& + anakin::Net& get_executer(); - ~PaddleInferenceAnakinPredictor() override{}; + ~PaddleInferenceAnakinPredictor() override { + delete executor_p_; + executor_p_ = nullptr; + }; private: bool Init(const AnakinConfig& config); - anakin::graph::Graph + anakin::graph::Graph graph_; - anakin::Net - executor_; + anakin::Net* + executor_p_{nullptr}; AnakinConfig config_; }; diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc index d6d631bfbad4278fe99e4553a410a9d9162dcc7b..62e820b68c79a47d963bb174663bfc8c4ac22de3 100644 --- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc +++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc @@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include +#include "gflags/gflags.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" -DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(model, "", "Directory of the inference model(mobile_v2)."); namespace paddle { AnakinConfig GetConfig() { AnakinConfig config; + // using AnakinConfig::X86 if you need to use cpu to do inference + config.target_type = AnakinConfig::NVGPU; config.model_file = FLAGS_model; config.device = 0; config.max_batch_size = 1; @@ -36,28 +38,27 @@ TEST(inference, anakin) { CreatePaddlePredictor(config); float data[1 * 3 * 224 * 224] = {1.0f}; - - PaddleTensor tensor{.name = "input_0", - .shape = std::vector({1, 3, 224, 224}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({1, 3, 224, 224}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::FLOAT32; // For simplicity, we set all the slots with the same data. - std::vector paddle_tensor_feeds; - paddle_tensor_feeds.emplace_back(std::move(tensor)); + std::vector paddle_tensor_feeds(1, tensor); - PaddleTensor tensor_out{.name = "prob_out", - .shape = std::vector({1000, 1}), - .data = PaddleBuf(), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor_out; + tensor_out.name = "prob_out"; + tensor_out.shape = std::vector({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; - std::vector outputs; - outputs.emplace_back(std::move(tensor_out)); + std::vector outputs(1, tensor_out); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); float* data_o = static_cast(outputs[0].data.data()); - for (size_t j = 0; j < 1000; ++j) { + for (size_t j = 0; j < outputs[0].data.length(); ++j) { LOG(INFO) << "output[" << j << "]: " << data_o[j]; } } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 58fd7c6f8b05a846bd4a82068f09f5d9ef5a6516..08d7af6d3af7054061b15b904c69b2862c629562 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + feeds->push_back(input); } return true; @@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch( buffer.Resize(sizeof(float) * data.size()); } std::memcpy(buffer.data(), data.data(), buffer.length()); + // copy LoD + for (const auto &level : fetchs[i].lod()) { + outputs->at(i).lod.emplace_back(level); + } outputs->at(i).dtype = PaddleDType::FLOAT32; // TODO(panyx0718): support other types? fill tensor name? avoid a copy. } diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index c0891e9c281961fa03d278a0f5c676f92672c419..45b5a7638b7dc6a54bbd905766fd5c284cb6aea1 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + if (!config_.model_dir.empty()) { + argument.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument.fluid_model_param_path.reset( + new std::string(config_.param_file)); + } argument.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); Singleton::Global().Run(&argument); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 62d98a796708612e7d4ff8abfd85125978ce22c7..fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { std::vector data(20); for (int i = 0; i < 20; i++) data[i] = i; - PaddleTensor tensor{ - .name = "", - .shape = std::vector({10, 1}), - .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({10, 1}); + tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 5f96fecf93f7a6c42bc6b9fe4e0d985c626388d7..03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -47,10 +47,10 @@ void Main(bool use_gpu) { //# 2. Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); @@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) { // 2. Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + std::vector inputs(4, tensor); std::vector outputs; // 3. Run diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 0a2a2b713ab21a3124d8a85ba469f64278623ec4..3800d49b34738d5a272033d75cb415ae9ad1fb8f 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -20,8 +20,8 @@ limitations under the License. */ #include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. #include #include +#include "paddle/fluid/inference/demo_ci/utils.h" #include "paddle/fluid/platform/enforce.h" -#include "utils.h" #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); @@ -123,11 +123,11 @@ void Main(bool use_gpu) { file.close(); // Inference. - PaddleTensor input{ - .name = "xx", - .shape = record.shape, - .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; VLOG(3) << "run executor"; std::vector output; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2f8b4f8596946988a728b5cf82de251bfda778a9..59b0df7968cce137843ba8cad38a62fdb8d3bfc1 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -44,7 +44,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -67,9 +67,9 @@ struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. std::vector shape; - // TODO(Superjomn) for LoD support, add a vector> field if needed. PaddleBuf data; // blob of data. PaddleDType dtype; + std::vector> lod; // lod data }; enum class PaddleEngineKind { @@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config { // Configurations for Anakin engine. struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; int device; std::string model_file; int max_batch_size{-1}; + TargetType target_type; }; struct TensorRTConfig : public NativeConfig { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 3864f337bdadc61e7531304e2cf2ee52a25253f2..d86c046f2e5b08a4c00cf6cad19627e6a196c798 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,6 +1,6 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc + SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -13,6 +13,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL) nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) - +nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL) nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL) + +nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99..dba1d50b2d1c487ced8e6ca51f2d257641ad5fc7 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -20,11 +20,60 @@ namespace tensorrt { class Conv2dOpConverter : public OpConverter { public: - Conv2dOpConverter() {} void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { LOG(INFO) << "convert a fluid conv2d op to tensorrt conv layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("Input").front()); + // Declare weights + auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); + const int n_output = Y_t->dims()[0]; + const int filter_h = Y_t->dims()[2]; + const int filter_w = Y_t->dims()[3]; + + const int groups = boost::get(op_desc.GetAttr("groups")); + const std::vector dilations = + boost::get>(op_desc.GetAttr("dilations")); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + nvinfer1::DimsHW nv_ksize(filter_h, filter_w); + nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]); + nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + Y_t->memory_size() / sizeof(float)}; + + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, Convolution, *const_cast(X), n_output, + nv_ksize, weight.get(), bias.get()); + PADDLE_ENFORCE(layer != nullptr); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + layer->setDilation(nv_dilations); + layer->setNbGroups(groups); + + auto output_name = op_desc.Output("Output").front(); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3744550f60a1696aedd8a3ecd24f1b21d22325b9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -0,0 +1,210 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ElementwiseWeightOpConverter : public OpConverter { + public: + ElementwiseWeightOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + PADDLE_ENFORCE(dims_x.nbDims >= 3); + + auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + auto* weight_data = Y_t->mutable_data(platform::CPUPlace()); + auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + + std::vector dims_y = framework::vectorize2int(Y_t->dims()); + if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { + if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); + } + + if (static_cast(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + } else if (static_cast(dims_y.size()) == dims_x.nbDims && + dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != dims_x.d[i]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + break; + } + } + if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != 1) + PADDLE_THROW( + "TensorRT unsupported weight shape for Elementwise op!"); + } + } + } else { + PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); + } + + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + Y_t->memory_size() / sizeof(float)}; + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *const_cast(X), scale_mode, + shift_weights.get(), scale_weights.get(), power_weights.get()); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } +}; + +class ElementwiseTensorOpConverter : public OpConverter { + public: + ElementwiseTensorOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + nvinfer1::Dims dims_y = Y->getDimensions(); + + // The two input tensor should have the same dims + PADDLE_ENFORCE(dims_x.nbDims >= 3); + if (dims_x.nbDims == dims_y.nbDims) { + for (int i = 0; i < dims_x.nbDims; i++) { + if (dims_x.d[i] != dims_y.d[i]) + PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); + } + } else { + PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!"); + } + + auto op_pair = ops.find(op_type_); + if (op_pair == ops.end()) { + PADDLE_THROW("Wrong elementwise op type!"); + } + nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *const_cast(X), + *const_cast(Y), op_pair->second); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } + + protected: + static const std::unordered_map + ops; + std::string op_type_; +}; + +const std::unordered_map + ElementwiseTensorOpConverter::ops = { + {"add", nvinfer1::ElementWiseOperation::kSUM}, + {"mul", nvinfer1::ElementWiseOperation::kPROD}, + {"sub", nvinfer1::ElementWiseOperation::kSUB}, + {"div", nvinfer1::ElementWiseOperation::kDIV}, + {"min", nvinfer1::ElementWiseOperation::kMIN}, + {"pow", nvinfer1::ElementWiseOperation::kPOW}, + {"max", nvinfer1::ElementWiseOperation::kMAX}, +}; + +class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorAddOpConverter() { op_type_ = "add"; } +}; + +class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMulOpConverter() { op_type_ = "mul"; } +}; + +class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorSubOpConverter() { op_type_ = "sub"; } +}; + +class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorDivOpConverter() { op_type_ = "div"; } +}; + +class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMinOpConverter() { op_type_ = "min"; } +}; + +class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMaxOpConverter() { op_type_ = "max"; } +}; + +class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorPowOpConverter() { op_type_ = "pow"; } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter); + +REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, + ElementwiseTensorAddOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor, + ElementwiseTensorSubOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor, + ElementwiseTensorDivOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor, + ElementwiseTensorMulOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor, + ElementwiseTensorMaxOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor, + ElementwiseTensorMinOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor, + ElementwiseTensorPowOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 409efac6799b6fb8d27a1343a55e7a508760868f..39fe1f609d7b94638506877fc301f19ef33ec8ac 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, } // indata c * k // Reorder the data layout from CK to KC. -void ReorderCKtoKC(TensorRTEngine::Weight& iweights, +void ReorderCKtoKC(TensorRTEngine::Weight& iweights, // NOLINT TensorRTEngine::Weight* oweights) { int c = iweights.dims[0]; int k = iweights.dims[1]; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 968f7eb99ce8519edaa585fd3cb642bd80cc63cc..1b6a0ad82f3ceb00cec15c28c8121adc22271b7a 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -55,6 +55,31 @@ class OpConverter { it = Registry::Lookup("fc"); } } + + if (op_desc.Type().find("elementwise") != std::string::npos) { + static std::unordered_set add_tensor_op_set{ + "add", "mul", "sub", "div", "max", "min", "pow"}; + // TODO(xingzhaolong): all mul, sub, div + // static std::unordered_set add_weight_op_set {"add", "mul", + // "sub", "div"}; + static std::unordered_set add_weight_op_set{"add"}; + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + int op_type_len = op_desc.Type().size(); + std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = + Registry::Lookup("elementwise_" + op_type + "_weight"); + } else { + PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = + Registry::Lookup("elementwise_" + op_type + "_tensor"); + } + } + if (!it) { it = Registry::Lookup(op_desc.Type()); } diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8711c6b60d74639529624c25429bc245de46479 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(conv2d_op, test) { + std::unordered_set parameters({"conv2d-Y"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5)); + validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3)); + validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("conv2d"); + desc.SetInput("Input", {"conv2d-X"}); + desc.SetInput("Filter", {"conv2d-Y"}); + desc.SetOutput("Output", {"conv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("dilations", dilations); + desc.SetAttr("groups", groups); + + validator.SetOp(*desc.Proto()); + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(conv2d); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7537d02a35b66a41c158cd8eb1b1e5d4107e7d84 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(elementwise_op, add_weight_test) { + std::unordered_set parameters({"elementwise_add-Y"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +TEST(elementwise_op, add_tensor_test) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(8, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3)); + // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + // the defalut axis of elementwise op is -1 + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(elementwise_add); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 9b79f86b0edba983019bd932f52b08711ff36d41..d6651a5b244ba31a01220e6299cb2016ae61fe64 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) { framework::ProgramDesc prog; auto* block = prog.MutableBlock(0); auto* conv2d_op = block->AppendOp(); + + // init trt engine + cudaStream_t stream_; + std::unique_ptr engine_; + engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); + engine_->InitNetwork(); + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + + engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(2, 5, 5)); + conv2d_op->SetType("conv2d"); + conv2d_op->SetInput("Input", {"conv2d-X"}); + conv2d_op->SetInput("Filter", {"conv2d-Y"}); + conv2d_op->SetOutput("Output", {"conv2d-Out"}); - OpConverter converter; + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + conv2d_op->SetAttr("strides", strides); + conv2d_op->SetAttr("paddings", paddings); + conv2d_op->SetAttr("dilations", dilations); + conv2d_op->SetAttr("groups", groups); + + // init scope framework::Scope scope; - converter.ConvertBlock(*block->Proto(), {}, scope, - nullptr /*TensorRTEngine*/); + std::vector dim_vec = {3, 2, 3, 3}; + auto* x = scope.Var("conv2d-Y"); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + + OpConverter converter; + converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope, + engine_.get() /*TensorRTEngine*/); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 39529cc2c799212f91107b1b86dd2c8c3642b6da..63c2f978f253df11100ecca83acae5eab6a0337d 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -149,7 +149,7 @@ class TRTConvertValidation { cudaStreamSynchronize(*engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); - const size_t output_space_size = 2000; + const size_t output_space_size = 3000; for (const auto& output : op_desc_->OutputArgumentNames()) { std::vector fluid_out; std::vector trt_out(output_space_size); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index b3781ded01c09edd59df09fd064b37052ad0333a..22cbf680c0670552fb014043c69fcadc56863529 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" -DEFINE_bool(cudnn_deterministic, true, +DEFINE_bool(cudnn_deterministic, false, "Whether allow using an autotuning algorithm for convolution " "operator. The autotuning algorithm may be non-deterministic. If " - "false, the algorithm is deterministic."); + "true, the algorithm is deterministic."); namespace paddle { namespace operators { @@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); if (input_grad) { - if (FLAGS_cudnn_deterministic) { + if (!FLAGS_cudnn_deterministic) { CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, @@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } if (filter_grad) { - if (FLAGS_cudnn_deterministic) { + if (!FLAGS_cudnn_deterministic) { CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 5098bd8700e11c9a2faeba90c38ed2d9499b17cf..0511eb42a073ac305634110a71a35e501f062132 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( const std::shared_ptr user_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(src_pd, user_pd, user_memory_p, @@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( const std::shared_ptr user_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, @@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( const std::shared_ptr user_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, @@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); auto user_pd = user_weights_memory_p->get_primitive_desc(); return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, @@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireSrcMemoryFromPrimitive( const std::shared_ptr user_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto src_pd = conv_pd_->src_primitive_desc(); auto user_pd = user_memory_p->get_primitive_desc(); return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", @@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { + std::vector& pipeline) { // NOLINT auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); return this->AcquireMemory(weights_pd, user_weights_pd, @@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial - static std::string GetHash(memory::dims& input_dims, - memory::dims& weights_dims, - std::vector& strides, - std::vector& paddings, - std::vector& dilations, int groups, - const std::string& suffix) { + static std::string GetHash(memory::dims& input_dims, // NOLINT + memory::dims& weights_dims, // NOLINT + std::vector& strides, // NOLINT + std::vector& paddings, // NOLINT + std::vector& dilations, // NOLINT + int groups, const std::string& suffix) { return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + suffix; diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc index 1a5427b39241b666eeaf12b173ea00443bb5f6e4..c86cd57316078778e5930c9b524b931d523028d7 100644 --- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc @@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); auto x_dims = x->dims(); - auto y_dims = y->dims(); + auto y_dims_untrimed = y->dims(); auto z_dims = z->dims(); // Execute default elementwise_add operator when // broadcast operations need to performed. - if (x_dims != y_dims) { + if (x_dims != y_dims_untrimed) { auto sum_func = [](T a, T b) -> T { return a + b; }; TransformFunctor { ctx.template device_context(), sum_func); - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; @@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { "Wrong layout/format set for Y tensor"); std::vector src_x_tz = framework::vectorize2int(x_dims); - std::vector src_y_tz = framework::vectorize2int(y_dims); + std::vector src_y_tz = framework::vectorize2int(y_dims_untrimed); std::vector dst_tz = framework::vectorize2int(z_dims); std::vector srcs_pd; @@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); + // skip out, x, y, + // dout length is larger or equal than dx, dy. + auto* out = dout; + auto *x = dout, *y = dout; auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { in->set_layout(DataLayout::kMKLDNN); in->set_format(out->format()); }; - if (x->dims() == y->dims()) { - auto blas = math::GetBlas(ctx); - if (dx) { - blas.VCOPY(dout->numel(), dout->data(), - dx->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dx, dout); - } - - if (dy) { - blas.VCOPY(dout->numel(), dout->data(), - dy->mutable_data(ctx.GetPlace())); - set_mkldnn_format(dy, dout); + if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { + if (dx->dims() == dy->dims()) { + auto blas = math::GetBlas(ctx); + if (dx) { + blas.VCOPY(dout->numel(), dout->data(), + dx->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dx, dout); + } + + if (dy) { + blas.VCOPY(dout->numel(), dout->data(), + dy->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dy, dout); + } } } else { // Execute default kernel when broadcast is needed - ElemwiseGradCompute, IdentityGrad>( + ElemwiseExplicitGradCompute, IdentityGrad>( ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); } diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc index d2c20537136fc3ac9d1bece24a2238f26215c922..3c97ac995c649ecd0d196a584240e1e7ac04f08e 100644 --- a/paddle/fluid/operators/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y"); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out", + "X"); REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h index baf04c30b17cb333fc8a6544afd6c479442f835b..5356105e2e551c0528694091608fc7585dce66d2 100644 --- a/paddle/fluid/operators/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise_add_op.h @@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); - ElemwiseGradCompute, IdentityGrad>( - ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), - IdentityGrad()); + ElemwiseExplicitGradCompute, + IdentityGrad>(ctx, *x, *y, *out, *dout, axis, + dx, dy, IdentityGrad(), + IdentityGrad()); } template @@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; - if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) { + if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr && + dy != nullptr && (dx->dims() == dy->dims())) { elementwise_add_grad(ctx, x, y, out, dout, dx, dy); } else { default_elementwise_add_grad(ctx, x, y, out, dout, dx, diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc index 824b1221e5a77c8799dc34820b7f0db180c2439e..84c8a65e5f859d276ae6d5f1a3f25c9d713a7a61 100644 --- a/paddle/fluid/operators/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise_div_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; + REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y"); + REGISTER_OP_CPU_KERNEL( elementwise_div, ops::ElementwiseDivKernel, diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index bb88970e42c194d9437609b62435f1a89e2b446b..d8a12e800ad733800c1ec333f15d31d4dcd1a3a5 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { void Make() final { AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - AddOutput("Out", "The output of elementwise op.").Reuse("X"); + // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save + // memory.").AsIntermediate(); + AddOutput("Out", "The output of elementwise op."); AddAttr("axis", "(int, default -1). The start dimension index " "for broadcasting Y onto X.") @@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$. )DOC", GetName(), GetEquation())); + SetReuse(); } protected: virtual std::string GetName() const = 0; virtual std::string GetEquation() const = 0; + virtual void SetReuse() {} }; class ElementwiseOpGrad : public framework::OperatorWithKernel { @@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = framework::ToDataType( + ctx.Input(framework::GradVarName("Out"))->type()); #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { @@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; + +// For Add, Sub op, the X, Out is not needed. +class ElementwiseOpExplicitGrad : public ElementwiseOpGrad { + public: + using operators::ElementwiseOpGrad::ElementwiseOpGrad; + using operators::ElementwiseOpGrad::GetExpectedKernelType; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(x_grad_name, out_dims); + } + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + } // namespace operators } // namespace paddle +/* +*/ + +#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \ + class kernel_type##GradMaker \ + : public paddle::framework::SingleGradOpDescMaker { \ + public: \ + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \ + \ + protected: \ + std::unique_ptr Apply() const override { \ + auto* op = new paddle::framework::OpDesc(); \ + op->SetType(#kernel_type "_grad"); \ + op->SetInput("Y", Input("Y")); \ + op->SetInput(::paddle::framework::GradVarName("Out"), \ + OutputGrad("Out")); \ + op->SetAttrMap(Attrs()); \ + op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \ + op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y")); \ + return std::unique_ptr<::paddle::framework::OpDesc>(op); \ + } \ + } + #define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \ class __ElemwiseOp##op_type##Maker__ \ : public ::paddle::operators::ElementwiseOpMaker { \ @@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ::paddle::operators::ElementwiseOpInferVarType, \ ::paddle::framework::DefaultGradOpDescMaker); \ REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad) + +#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \ + class __ElemwiseOp##op_type##Maker__ \ + : public ::paddle::operators::ElementwiseOpMaker { \ + protected: \ + virtual std::string GetName() const { return op_name; } \ + virtual std::string GetEquation() const { return equation; } \ + virtual void SetReuse() { Reuse(__VA_ARGS__); } \ + }; \ + REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ + __ElemwiseOp##op_type##Maker__, \ + ::paddle::operators::ElementwiseOpInferVarType, \ + op_type##GradMaker); \ + REGISTER_OPERATOR(op_type##_grad, \ + ::paddle::operators::ElementwiseOpExplicitGrad) diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 8b052611f80ddf874ca48c1c58e13346528a834e..eb8272e90c32c3a0be2c0ce1bc679571af876317 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } -inline void trim_trailing_singular_dims(framework::DDim* dims) { +inline framework::DDim trim_trailing_singular_dims( + const framework::DDim& dims) { // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims->size(); + auto actual_dims_size = dims.size(); for (; actual_dims_size != 0; --actual_dims_size) { - if ((*dims)[actual_dims_size - 1] != 1) break; + if (dims[actual_dims_size - 1] != 1) break; } - if (actual_dims_size != dims->size()) { - auto actual_dims = framework::vectorize(*dims); - actual_dims.resize(actual_dims_size); - *dims = framework::make_ddim(actual_dims); + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; } + framework::DDim actual_dims = framework::make_ddim(trim_dims); + return actual_dims; } template @@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, #endif +template +void ElemwiseGradComputeNoBroadcast( + const framework::ExecutionContext& ctx, const framework::DDim& x_dim, + const framework::DDim& y_dim, const framework::Tensor& x, + const framework::Tensor& y, const framework::Tensor& out, + const framework::Tensor& dout, int axis, framework::Tensor* dx, + framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + size_t N = static_cast(framework::product(x_dim)); + platform::ForRange for_range( + ctx.template device_context(), N); + for_range(ElemwiseGradNoBroadcast{ + x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); +} + +template +void ElemwiseGradComputeWithBroadcast( + const framework::ExecutionContext& ctx, const framework::DDim& x_dim, + const framework::DDim& y_dim_untrimed, const framework::Tensor& x, + const framework::Tensor& y, const framework::Tensor& out, + const framework::Tensor& dout, int axis, framework::Tensor* dx, + framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + if (post == 1) { + int h = pre; + int w = n; + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + ElemwiseGradBroadcast1CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + ElemwiseGradBroadcast1CPU( + x.data(), y.data(), out.data(), dout.data(), h, w, dx_op, + dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + ElemwiseGradBroadcast2CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), out.data(), dout.data(), pre, n, post, dx_op, + dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + ElemwiseGradBroadcast2CPU( + x.data(), y.data(), out.data(), dout.data(), pre, n, post, + dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } +} + template void ElemwiseGradCompute(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& y, @@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, const framework::Tensor& dout, int axis, framework::Tensor* dx, framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + const framework::DDim x_dim = x.dims(); + const framework::DDim y_dim = y.dims(); if (x.dims() == y.dims()) { - size_t N = static_cast(framework::product(x.dims())); - platform::ForRange for_range( - ctx.template device_context(), N); - for_range(ElemwiseGradNoBroadcast{ - x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); + ElemwiseGradComputeNoBroadcast( + ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { // Y is a scalar - auto x_dim = x.dims(); - auto y_dim = y.dims(); - - axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); - trim_trailing_singular_dims(&y_dim); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; - - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - if (post == 1) { - int h = pre; - int w = n; - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ - ElemwiseGradBroadcast1CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - ElemwiseGradBroadcast1CPU( - x.data(), y.data(), out.data(), dout.data(), h, w, - dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ - ElemwiseGradBroadcast2CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, dx_op, - dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - ElemwiseGradBroadcast2CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - post, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } + ElemwiseGradComputeWithBroadcast( + ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } +} + +// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. +// explicit gradient can cut off X, Y, Out from gradient op +// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse +// elementwise code. +template +void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx, + const framework::Tensor& x, + const framework::Tensor& y, + const framework::Tensor& out, + const framework::Tensor& dout, int axis, + framework::Tensor* dx, framework::Tensor* dy, + DX_OP dx_op, DY_OP dy_op) { + if (dy == nullptr) { + const framework::DDim dx_dims = dout.dims(); + auto dy_dims = dx_dims; + ElemwiseGradComputeNoBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { + if (dout.dims() == dy->dims()) { + const framework::DDim dx_dims = dout.dims(); + const framework::DDim dy_dims = dy->dims(); + ElemwiseGradComputeNoBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { // Y is a scalar + auto dx_dims = dout.dims(); + const framework::DDim dy_dims = dy->dims(); + ElemwiseGradComputeWithBroadcast( + ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } } +// Deprecated template void ElementwiseGradCompute(const framework::ExecutionContext& ctx, @@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, } axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - trim_trailing_singular_dims(&y_dims); + trim_trailing_singular_dims(y_dims); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; @@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, x, y, z, ctx.template device_context(), func); auto x_dims = x->dims(); - auto y_dims = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + auto y_dims_untrimed = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), "Rank of first input must >= rank of second input."); - if (x_dims == y_dims) { + if (x_dims == y_dims_untrimed) { functor.Run(); return; } - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), "Axis should be in range [0, x_dims)"); - trim_trailing_singular_dims(&y_dims); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; int pre, n, post; diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc index a7562b166b373ee2a8c9b6f379431d88d3e45fcb..b7224261e6a7ca82dff92a25f5fe8818c08e676d 100644 --- a/paddle/fluid/operators/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise_sub_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y"); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out", + "X"); + REGISTER_OP_CPU_KERNEL( elementwise_sub, ops::ElementwiseSubKernel, diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h index fe088b8203722a43b9aba7be3878b8f4ca68ba12..11c7e3fe628001f095836a788f2bcc7c4ee7ad4b 100644 --- a/paddle/fluid/operators/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElemwiseGradCompute, SubGradDY>( + // skip out, x, y + auto* out = dout; + auto *x = dout, *y = dout; + + ElemwiseExplicitGradCompute, SubGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 31a7458f637921c290fc71ac748143867b4aae19..fefc7125b4de7274589670d29be4511469d5064a 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ctx->GetInputDim(framework::GradVarName("Out")), "Input(Out) and its gradients should have a same shape."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: @@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { layout_ = framework::DataLayout::kMKLDNN; } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = framework::ToDataType( + ctx.Input(framework::GradVarName("Out"))->type()); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "float16 can only be used on GPU place"); @@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { } }; +class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("softmax_grad"); + + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SoftmaxOpGradMaker); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( softmax, ops::SoftmaxKernel, diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0f334b2892d77b836728cf79898d1832e90e7c00..a8bc16f1b5b9b624e88e355d8ce4741fcec34bc3 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -534,7 +534,7 @@ EOF make -j `nproc` inference_lib_dist cd ${PADDLE_ROOT}/build cp -r fluid_install_dir fluid - tar -cf fluid.tgz fluid + tar -czf fluid.tgz fluid fi } diff --git a/python/paddle/batch.py b/python/paddle/batch.py index d48c54fcbb66487617b1946bc69724870c8f879c..008509660739d61245526278735064472b8b06dd 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False): if drop_last == False and len(b) != 0: yield b + # Batch size check + batch_size = int(batch_size) + if batch_size <= 0: + raise ValueError("batch_size should be a positive integeral value, " + "but got batch_size={}".format(batch_size)) + return batch_reader diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3ef4afa691b1dfba07fb132753f380727bb4f3ae..60a12686f8ff43f5ee7e30650a208296963bda3d 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -37,6 +37,7 @@ __all__ = [ __auto__ = [ 'iou_similarity', 'box_coder', + 'polygon_box_transform', ] __all__ += __auto__ diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 9e97ec9a6f55680a2eb44ad712ac002df4fecda5..01db8645b3aff77371f01c3dec51c85f99065552 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -66,9 +66,7 @@ __all__ = [ 'scatter', 'sum', 'slice', - 'polygon_box_transform', 'shape', - 'iou_similarity', 'maxout', ] + __activations__ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 10028a8c6e33edcea27650d925ca7378b770f143..3bbd11d9836a62cdf9f2a84fc75e933001e12159 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -121,7 +121,7 @@ class ParallelExecutor(object): else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - exec_strategy.num_threads = cpu_num + exec_strategy.num_threads = cpu_num * 2 if build_strategy is None: build_strategy = BuildStrategy() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 43f68ff4592df6757691b06db52cf5e0e2ebc6d7..c8e881a672ad25654bd28604abfafc2c569af7ca 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -49,6 +49,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) +list(REMOVE_ITEM TEST_OPS test_dist_transformer) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -61,4 +62,5 @@ if(WITH_DISTRIBUTE) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ee8020a73546cb9037e9dc4be589c62bb1b6b937 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -0,0 +1,280 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +import os +import sys +import transformer_model +import paddle.dataset.wmt16 as wmt16 + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + +WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" + + +class ModelHyperParams(object): + # Dictionary size for source and target language. This model directly uses + # paddle.dataset.wmt16 in which , and token has + # alreay been added, but the token is not added. Transformer requires + # sequences in a mini-batch are padded to have the same length. A token is + # added into the original dictionary in paddle.dateset.wmt16. + + # size of source word dictionary. + src_vocab_size = 10000 + # index for token in source language. + src_pad_idx = src_vocab_size + + # size of target word dictionay + trg_vocab_size = 10000 + # index for token in target language. + trg_pad_idx = trg_vocab_size + + # position value corresponding to the token. + pos_pad_idx = 0 + + # max length of sequences. It should plus 1 to include position + # padding token for position encoding. + max_length = 50 + + # the dimension for word embeddings, which is also the last dimension of + # the input and output of multi-head attention, position-wise feed-forward + # networks, encoder and decoder. + + d_model = 512 + # size of the hidden layer in position-wise feed-forward networks. + d_inner_hid = 1024 + # the dimension that keys are projected to for dot-product attention. + d_key = 64 + # the dimension that values are projected to for dot-product attention. + d_value = 64 + # number of head used in multi-head attention. + n_head = 8 + # number of sub-layers to be stacked in the encoder and decoder. + n_layer = 6 + # dropout rate used by all dropout layers. + dropout = 0.1 + + +def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. Then, convert the numpy + data to tensors and return a dict mapping names to tensors. + """ + + def __pad_batch_data(insts, + pad_idx, + is_target=False, + return_pos=True, + return_attn_bias=True, + return_max_len=True): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ + return_list = [] + max_len = max(len(inst) for inst in insts) + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if return_pos: + inst_pos = np.array([[ + pos_i + 1 if w_i != pad_idx else 0 + for pos_i, w_i in enumerate(inst) + ] for inst in inst_data]) + + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, + max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( + [-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + return return_list if len(return_list) > 1 else return_list[0] + + src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, is_target=False) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, is_target=True) + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, + False, False, False) + lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + + return [ + src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ] + + +def transformer(use_feed): + assert not use_feed, "transfomer doesn't support feed yet" + return transformer_model.transformer( + ModelHyperParams.src_vocab_size + 1, + ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) + + +def get_model(): + avg_cost = transformer(use_feed=False) + optimizer = fluid.optimizer.Adam() + optimizer.minimize(avg_cost) + return avg_cost + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +class DistTransformer2x2(object): + def run_pserver(self, pserver_endpoints, trainers, current_endpoint, + trainer_id): + get_model() + t = get_transpiler(trainer_id, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(pserver_prog) + + def _wait_ps_ready(self, pid): + retry_times = 20 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + print("waiting ps ready: ", pid) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): + avg_cost = get_model() + if is_dist: + t = get_transpiler(trainer_id, + fluid.default_main_program(), endpoints, + trainers) + trainer_prog = t.get_trainer_program() + else: + trainer_prog = fluid.default_main_program() + + startup_exe = fluid.Executor(place) + startup_exe.run(fluid.default_startup_program()) + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, loss_name=avg_cost.name, exec_strategy=strategy) + + first_loss, = exe.run(fetch_list=[avg_cost.name]) + print(first_loss) + for i in xrange(5): + _ = exe.run(fetch_list=[avg_cost.name]) + last_loss, = exe.run(fetch_list=[avg_cost.name]) + print(last_loss) + + +def main(role="pserver", + endpoints="127.0.0.1:9123", + trainer_id=0, + current_endpoint="127.0.0.1:9123", + trainers=1, + is_dist=True): + + reader = paddle.batch( + wmt16.train(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=transformer_model.batch_size) + + with fluid.recordio_writer.create_recordio_writer( + WMT16_RECORDIO_FILE) as writer: + for batch in reader(): + for tensor in prepare_batch_input( + batch, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): + t = fluid.LoDTensor() + t.set(tensor, fluid.CPUPlace()) + writer.append_tensor(t) + writer.complete_append_tensor() + + model = DistTransformer2x2() + if role == "pserver": + model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) + else: + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print( + "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + ) + role = sys.argv[1] + endpoints = sys.argv[2] + trainer_id = int(sys.argv[3]) + current_endpoint = sys.argv[4] + trainers = int(sys.argv[5]) + is_dist = True if sys.argv[6] == "TRUE" else False + main( + role=role, + endpoints=endpoints, + trainer_id=trainer_id, + current_endpoint=current_endpoint, + trainers=trainers, + is_dist=is_dist) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 82b5e7cf0b3633eb04ab97c5300b1926b9d47cb6..2ddfd47fe0c33b0e9771fe6f502b90eb77161100 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -66,6 +66,10 @@ def get_numeric_gradient(place, tensor_to_check_dtype = np.float32 elif tensor_to_check_dtype == core.VarDesc.VarType.FP64: tensor_to_check_dtype = np.float64 + elif tensor_to_check_dtype == core.VarDesc.VarType.FP16: + tensor_to_check_dtype = np.float16 + # set delta as np.float16, will automatic convert to float32, float64 + delta = np.array(delta).astype(np.float16) else: raise ValueError("Not supported data type " + str( tensor_to_check_dtype)) @@ -73,13 +77,24 @@ def get_numeric_gradient(place, gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype) def __get_elem__(tensor, i): - if tensor_to_check_dtype == np.float32: + if tensor_to_check_dtype == np.float16: + numpy_tensor = np.array(tensor).astype(np.float16) + numpy_tensor = numpy_tensor.flatten() + return numpy_tensor[i] + elif tensor_to_check_dtype == np.float32: return tensor._get_float_element(i) else: return tensor._get_double_element(i) def __set_elem__(tensor, i, e): - if tensor_to_check_dtype == np.float32: + if tensor_to_check_dtype == np.float16: + numpy_tensor = np.array(tensor).astype(np.float16) + shape = numpy_tensor.shape + numpy_tensor = numpy_tensor.flatten() + numpy_tensor[i] = e + numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16) + tensor.set(numpy_tensor, place) + elif tensor_to_check_dtype == np.float32: tensor._set_float_element(i, e) else: tensor._set_double_element(i, e) @@ -133,6 +148,11 @@ class OpTest(unittest.TestCase): if not self.call_once: self.call_once = True self.dtype = data_type + # See the comment of np_dtype_to_fluid_dtype + # If the input type is uint16, we assume use float16 + # for lodtensor dtype. + if self.dtype == np.uint16: + self.dtype == np.float16 def infer_dtype_from_inputs_outputs(self, inputs, outputs): def infer_dtype(numpy_dict): @@ -161,19 +181,25 @@ class OpTest(unittest.TestCase): for name, np_value in self.inputs[var_name]: tensor = core.LoDTensor() if isinstance(np_value, tuple): - tensor.set(np_value[0], place) + tensor.set( + OpTest.np_value_to_fluid_value(np_value[0]), place) tensor.set_recursive_sequence_lengths(np_value[1]) else: - tensor.set(np_value, place) + tensor.set( + OpTest.np_value_to_fluid_value(np_value), place) feed_map[name] = tensor else: tensor = core.LoDTensor() if isinstance(self.inputs[var_name], tuple): - tensor.set(self.inputs[var_name][0], place) + tensor.set( + OpTest.np_value_to_fluid_value(self.inputs[var_name][ + 0]), place) tensor.set_recursive_sequence_lengths(self.inputs[var_name][ 1]) else: - tensor.set(self.inputs[var_name], place) + tensor.set( + OpTest.np_value_to_fluid_value(self.inputs[var_name]), + place) feed_map[var_name] = tensor return feed_map @@ -307,13 +333,22 @@ class OpTest(unittest.TestCase): np.allclose( actual_t, expect_t, atol=atol), "Output (" + out_name + ") has diff at " + str(place) + - str(actual_t) + "\n" + str(expect_t)) + "\nExpect " + str(expect_t) + "\n" + "But Got" + + str(actual_t)) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + str(place)) def _get_places(self): + if self.dtype == np.float16: + if core.is_compiled_with_cuda() and core.op_support_gpu( + self.op_type): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + return [place] + else: + return [] places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -344,9 +379,9 @@ class OpTest(unittest.TestCase): def err_msg(): offset = np.argmax(diff_mat > max_relative_error) return ("%s Variable %s max gradient diff %f over limit %f, " - "the first error element is %d, %f, %f") % ( - msg_prefix, name, max_diff, max_relative_error, - offset, a.flatten()[offset], b.flatten()[offset]) + "the first error element is %d, expected %f, but got %f" + ) % (msg_prefix, name, max_diff, max_relative_error, + offset, a.flatten()[offset], b.flatten()[offset]) self.assertLessEqual(max_diff, max_relative_error, err_msg()) @@ -435,6 +470,21 @@ class OpTest(unittest.TestCase): input.dtype = np.uint16 return input + @staticmethod + def fluid_dtype_to_np_dtype(self, dtype): + """ + See above, convert the dtype to normal type. + """ + if dtype == np.uint16: + dtype = np.float16 + return dtype + + @staticmethod + def np_value_to_fluid_value(input): + if input.dtype == np.float16: + input = input.view(np.uint16) + return input + def _get_gradient(self, input_to_check, place, @@ -457,7 +507,7 @@ class OpTest(unittest.TestCase): if isinstance(place, fluid.CUDAPlace(0)): use_cuda = True executor = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=loss.name, main_program=program) + use_cuda=use_cuda, loss_name=loss.name, main_program=prog) else: executor = Executor(place) return map(np.array, diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py new file mode 100644 index 0000000000000000000000000000000000000000..1aaab6f906ef6482bc515bb3c42d82431902e1d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -0,0 +1,137 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time + +import unittest +import os +import sys +import signal +import subprocess + + +class TestDistBase(unittest.TestCase): + def setUp(self): + self._trainers = 2 + self._pservers = 2 + self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" + self._python_interp = "python" + + def start_pserver(self, model_file): + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) + ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) + + ps0_proc = subprocess.Popen( + ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps1_proc = subprocess.Popen( + ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return ps0_proc, ps1_proc + + def _wait_ps_ready(self, pid): + retry_times = 50 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error as e: + sys.stderr.write('waiting for pserver: %s, left retry %d\n' % + (e, retry_times)) + retry_times -= 1 + + def check_with_place(self, model_file, delta=1e-3): + # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN + required_envs = { + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH"), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15" + } + # Run local to get a base line + env_local = {"CUDA_VISIBLE_DEVICES": "0"} + env_local.update(required_envs) + local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ + (self._python_interp, model_file, + "127.0.0.1:1234", "127.0.0.1:1234", 1) + local_proc = subprocess.Popen( + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env_local) + local_proc.wait() + out, err = local_proc.communicate() + local_ret = out + sys.stderr.write('local_loss: %s\n' % local_ret) + sys.stderr.write('local_stderr: %s\n' % err) + + # Run dist train to compare with local results + ps0, ps1 = self.start_pserver(model_file) + self._wait_ps_ready(ps0.pid) + self._wait_ps_ready(ps1.pid) + + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \ + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) + tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \ + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) + + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + env0.update(required_envs) + env1.update(required_envs) + FNULL = open(os.devnull, 'w') + + tr0_proc = subprocess.Popen( + tr0_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env0) + tr1_proc = subprocess.Popen( + tr1_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env1) + + tr0_proc.wait() + tr1_proc.wait() + out, err = tr0_proc.communicate() + sys.stderr.write('dist_stderr: %s\n' % err) + loss_data0 = out + sys.stderr.write('dist_loss: %s\n' % loss_data0) + lines = loss_data0.split("\n") + dist_first_loss = eval(lines[0].replace(" ", ","))[0] + dist_last_loss = eval(lines[1].replace(" ", ","))[0] + + local_lines = local_ret.split("\n") + local_first_loss = eval(local_lines[0])[0] + local_last_loss = eval(local_lines[1])[0] + + self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) + self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) + + # check tr0_out + # FIXME: ensure the server process is killed + # replace with ps0.terminate() + os.kill(ps0.pid, signal.SIGKILL) + os.kill(ps1.pid, signal.SIGKILL) + FNULL.close() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index 3b67b3f5ccd67f86f87f292d83a6039ff46260bd..04671d079731ce414561b0ede6bc2b195b07d82a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -11,127 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import numpy as np -import argparse -import time -import math - import unittest -import os -import sys -import signal -import subprocess - - -class TestDistSeResneXt2x2(unittest.TestCase): - def setUp(self): - self._trainers = 2 - self._pservers = 2 - self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" - self._python_interp = "python" - - def start_pserver(self): - ps0_ep, ps1_ep = self._ps_endpoints.split(",") - ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) - ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) - - ps0_proc = subprocess.Popen( - ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - ps1_proc = subprocess.Popen( - ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return ps0_proc, ps1_proc - - def _wait_ps_ready(self, pid): - retry_times = 20 - while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(3) - try: - # the listen_and_serv_op would touch a file which contains the listen port - # on the /tmp directory until it was ready to process all the RPC call. - os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def test_with_place(self): - # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN - required_envs = { - "PATH": os.getenv("PATH"), - "PYTHONPATH": os.getenv("PYTHONPATH"), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), - "FLAGS_fraction_of_gpu_memory_to_use": "0.15" - } - # Run local to get a base line - env_local = {"CUDA_VISIBLE_DEVICES": "0"} - env_local.update(required_envs) - local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \ - (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1) - local_proc = subprocess.Popen( - local_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env_local) - local_proc.wait() - out, err = local_proc.communicate() - local_ret = out - sys.stderr.write('local_loss: %s\n' % local_ret) - sys.stderr.write('local_stderr: %s\n' % err) - - # Run dist train to compare with local results - ps0, ps1 = self.start_pserver() - self._wait_ps_ready(ps0.pid) - self._wait_ps_ready(ps1.pid) - - ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) - tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \ - (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) - - env0 = {"CUDA_VISIBLE_DEVICES": "0"} - env1 = {"CUDA_VISIBLE_DEVICES": "1"} - env0.update(required_envs) - env1.update(required_envs) - FNULL = open(os.devnull, 'w') - - tr0_proc = subprocess.Popen( - tr0_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env0) - tr1_proc = subprocess.Popen( - tr1_cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=env1) - - tr0_proc.wait() - tr1_proc.wait() - out, err = tr0_proc.communicate() - sys.stderr.write('dist_stderr: %s\n' % err) - loss_data0 = out - sys.stderr.write('dist_loss: %s\n' % loss_data0) - lines = loss_data0.split("\n") - dist_first_loss = eval(lines[0].replace(" ", ","))[0] - dist_last_loss = eval(lines[1].replace(" ", ","))[0] - - local_lines = local_ret.split("\n") - local_first_loss = eval(local_lines[0])[0] - local_last_loss = eval(local_lines[1])[0] +from test_dist_base import TestDistBase - self.assertAlmostEqual(local_first_loss, dist_first_loss) - self.assertAlmostEqual(local_last_loss, dist_last_loss) - # check tr0_out - # FIXME: ensure the server process is killed - # replace with ps0.terminate() - os.kill(ps0.pid, signal.SIGKILL) - os.kill(ps1.pid, signal.SIGKILL) - FNULL.close() +class TestDistSeResneXt2x2(TestDistBase): + def test_se_resnext(self): + # TODO(paddle-dev): Is the delta too large? + self.check_with_place("dist_se_resnext.py", delta=0.2) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..68cd35d751dbce7eef9919dc8678fc0dd117757b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from test_dist_base import TestDistBase + + +class TestDistTransformer2x2(TestDistBase): + def test_transformer(self): + # TODO(paddle-dev): check if the delta is OK. + # Usually start around ~8000 and converge to ~5000 + self.check_with_place("dist_transformer.py", delta=400) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index acf652d3fb9743d69b7f7e248ff7a3ee83fc4c50..1854232194963bcbe302010320a30d85747eea96 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + 'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32") } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index d090960c84e47da68a0ebea4609dfc3ed76e114e..daa5da8d95129af0305b326832a557daeb4c5c9c 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -17,6 +17,8 @@ import numpy as np import math from op_test import OpTest +np.random.seed(100) + def find_latest_set(num): return 1 + int(math.floor(math.log(num, 2))) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ab2ab24f354c1fbdc8b5221061db56a8d8a48689..9e1b47643a554bc14170fc57ac05b21afdb8117a 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -465,6 +465,15 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_shape(self): + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + out = layers.shape(input, name="shape") + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 76389d916fc39f470a22aed4792bf7b754600436..c7a039d2589ef67bd1d3771a2f11084698ba909f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -211,7 +211,8 @@ class TestMNIST(TestParallelExecutorBase): self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + # FIXME(zcd): close this test temporally. + # self._compare_reduce_and_allreduce(fc_with_batchnorm, True) self._compare_reduce_and_allreduce(fc_with_batchnorm, False) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index b6215fddb11bb6b3a76b5a6395e7254d21971c13..8203d5d1fce0950130ab71db40fb306f73c41bd4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -21,7 +21,7 @@ import paddle import paddle.dataset.wmt16 as wmt16 import os -WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio" +WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" class ModelHyperParams(object): @@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase): writer.append_tensor(t) writer.complete_append_tensor() - @unittest.skip("transformer is buggy in multi gpu") def test_main(self): self.check_network_convergence(transformer, use_cuda=True) - self.check_network_convergence(transformer, use_cuda=False) + self.check_network_convergence(transformer, use_cuda=False, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index 55c6e54906e739ef0bc953fa5c9e9641ec575ccf..910d9538b009496813f40b82d62eb2b12964a99f 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -18,14 +18,6 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator -def as_lodtensor(np_array, lod, place): - tensor = core.LoDTensor() - tensor.set(np_value, place) - if lod is not None: - tensor.set_recursive_sequence_lengths(lod) - return tensor - - def create_op(scope, op_type, inputs, outputs, attrs): kwargs = dict() @@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs): def set_input(scope, op, inputs, place): + def np_value_to_fluid_value(input): + if input.dtype == np.float16: + input = input.view(np.uint16) + return input + def __set_input__(var_name, var): if isinstance(var, tuple) or isinstance(var, np.ndarray): tensor = scope.find_var(var_name).get_tensor() @@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place): tensor.set_recursive_sequence_lengths(var[1]) var = var[0] tensor._set_dims(var.shape) - tensor.set(var, place) + tensor.set(np_value_to_fluid_value(var), place) elif isinstance(var, float): scope.find_var(var_name).set_float(var) elif isinstance(var, int): @@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): if name not in np_list: assert var_proto.intermediate, "{} not found".format(name) else: + # inferece the dtype from numpy value. np_value = np_list[name] if isinstance(np_value, tuple): dtype = np_value[0].dtype @@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): if is_input: shape = list(np_value.shape) lod_level = 0 + # NOTE(dzhwinter): type hacking + # numpy float16 is binded to paddle::platform::float16 + # in tensor_py.h via the help of uint16 datatype. Because + # the internal memory representation of float16 is + # actually uint16_t in paddle. So we use np.uint16 in numpy for + # raw memory, it can pass through the pybind. So in the testcase, + # we feed data use data.view(uint16), but the dtype is float16 in fact. + # The data.view(uint16) means do not cast the data type, but process data as the uint16 + if dtype == np.uint16: + dtype = np.float16 return block.create_var( dtype=dtype, shape=shape, lod_level=lod_level, name=name) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index c62792face3c353db1f2e3c77eaf4bd32fbded69..d0eb3fd3724899aad39422983fd3cd0d00ff2a2d 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -403,7 +403,7 @@ def transformer( trg_pad_idx, pos_pad_idx, ): file_obj = fluid.layers.open_recordio_file( - filename='./wmt16.recordio', + filename='/tmp/wmt16.recordio', shapes=[ [batch_size * max_length, 1], [batch_size * max_length, 1],