diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index 878b950858a71ba0e10ab2643667922420d29099..c44793cd11d22b29b4b3422a047d81fe26624982 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -13,125 +13,186 @@
 // limitations under the License.
 
 #include
-#include
 #include
 #include
 #include
+#include
 #include
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_graph_view.h"
-#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-VarHandle* GetValidInput(const OpHandleBase* a) {
-  for (auto p : a->Inputs()) {
-    VarHandle* b = dynamic_cast<VarHandle*>(p);
-    if (b) {
-      return b;
+class AllReduceDepsPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles =
+        GetSortedAllReduceOps(*graph);
+
+    for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) {
+      auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      all_reduce_op_handles[i - 1]->AddOutput(dep_var);
+      all_reduce_op_handles[i]->AddInput(dep_var);
     }
-  }
-  return nullptr;
-}
-
-void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
-  auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
-
-  // get vars order
-  int order = 0;
-  std::unordered_map<std::string, int> vars;
-  // TODO(gongwb): use graph topology sort to find the order of operators.
-  // Note that must assert topology sort is stable
-  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
-  for (auto* op_desc : ops) {
-    try {
-      bool is_bk_op =
-          static_cast<bool>(boost::get<int>(op_desc->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kBackward));
-      if (!is_bk_op) continue;
-
-      auto backward_vars =
-          boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
-              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-
-      auto outputs = op_desc->Outputs();
-      for (auto& o_it : outputs) {
-        for (auto& v : o_it.second) {  // values
-          vars[v] = order;
-          VLOG(10) << "in all_reduce_deps_pass:" << v;
-        }
-      }
-      order++;
-    } catch (boost::bad_get e) {
+    if (VLOG_IS_ON(10)) {
+      DebugString(*graph, all_reduce_op_handles);
     }
   }
 
-  std::vector<OpHandleBase*> dist_ops;
-  // get allreduce ops.
-  for (auto& op : graph_ops) {
-    // FIXME(gongwb): add broad cast.
-    if (op->Name() == "all_reduce" || op->Name() == "reduce") {
-      dist_ops.push_back(op);
+  std::vector<AllReduceOpHandle*> GetSortedAllReduceOps(
+      const ir::Graph& graph) const {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles;
+    std::unordered_map<OpHandleBase*, size_t> pending_ops;
+    std::unordered_set<OpHandleBase*> ready_ops;
+    std::unordered_set<OpHandleBase*> next_ready_ops;
+
+    auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph);
+    size_t num_of_ops = op_handles.size();
+    for (OpHandleBase* op : op_handles) {
+      size_t not_ready_vars = op->NotReadyInputSize();
+      if (not_ready_vars) {
+        pending_ops.insert({op, not_ready_vars});
+      } else {
+        ready_ops.insert(op);
+      }
     }
-  }
-
-  VLOG(10) << "dist_ops size:" << dist_ops.size()
-           << ", outputs size:" << vars.size() << ", ops size:" << ops.size();
-
-  std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
-                                                  OpHandleBase* op2) {
-    VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
-    VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
-
-    PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
-                   op1->DebugString(), op2->DebugString());
-
-    auto l_it = vars.find(i0->name());
-    auto r_it = vars.find(i1->name());
-
-    PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
-                   "can't find var's name %s and %s in opdesc", i0->name(),
-                   i1->name());
-
-    if (l_it->second < r_it->second) return true;
+    GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+
+    size_t has_run_ops = ready_ops.size();
+    while (has_run_ops != num_of_ops) {
+      for (auto* op : ready_ops) {
+        for (auto& ready_var : op->Outputs()) {
+          for (auto* pend_op : ready_var->PendingOps()) {
+            auto& deps = --pending_ops[pend_op];
+            if (deps == 0) {
+              next_ready_ops.insert(pend_op);
+            }
+          }
+        }
+      }
 
-    if (l_it->second == r_it->second) {
-      return i0->name() < i1->name();
+      PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There may be a cycle.");
+      ready_ops.clear();
+      std::swap(ready_ops, next_ready_ops);
+      GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+      has_run_ops += ready_ops.size();
     }
+    return all_reduce_op_handles;
+  }
 
-    return false;
-  });
-
-  // add dependency.
-  auto& sorted_ops = dist_ops;
-  for (size_t i = 1; i < sorted_ops.size(); ++i) {
-    auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-
-    auto* pre_op = sorted_ops[i - 1];
-    auto* op = sorted_ops[i];
-
-    pre_op->AddOutput(dep_var);
-    op->AddInput(dep_var);
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+  void GetSortedAllReduceOps(
+      const std::unordered_set<OpHandleBase*>& ready_ops,
+      std::vector<AllReduceOpHandle*>* all_reduce_op_handles) const {
+    std::vector<AllReduceOpHandle*> current_all_reduce_op_handles;
+    for (auto& op_handle : ready_ops) {
+      auto all_reduce_op_handle = dynamic_cast<AllReduceOpHandle*>(op_handle);
+      if (all_reduce_op_handle) {
+        current_all_reduce_op_handles.emplace_back(all_reduce_op_handle);
+      }
+    }
 
-    VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
-             << " and " << op;
+    // NOTE(zcd): For distributed training, it is important to keep the order
+    // of allReduce on each node consistent. Otherwise, a hang may occur.
+    // Sort current_all_reduce_op_handles according to the name of the input.
+    std::sort(current_all_reduce_op_handles.begin(),
+              current_all_reduce_op_handles.end(),
+              [](const AllReduceOpHandle* left,
+                 const AllReduceOpHandle* right) -> bool {
+                auto left_in_vars = DynamicCast<VarHandle>(left->Inputs());
+                auto right_in_vars = DynamicCast<VarHandle>(right->Inputs());
+                PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
+                PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
+                return left_in_vars[0]->Name() > right_in_vars[0]->Name();
+              });
+
+    all_reduce_op_handles->insert(all_reduce_op_handles->end(),
+                                  current_all_reduce_op_handles.begin(),
+                                  current_all_reduce_op_handles.end());
+  }
 
-  VLOG(10) << "pre_op:" << pre_op->DebugString()
-           << ", op:" << op->DebugString();
+  void DebugString(
+      const ir::Graph& graph,
+      const std::vector<AllReduceOpHandle*>& all_reduce_op_handles) const {
+    // get vars order
+    std::map<int, std::vector<std::string>> vars =
+        GetSortedGradientsFromStaleProgram(graph);
+    std::stringstream out;
+    size_t grads_of_stale_program = 0;
+    out << "Get Order From kStaleProgramOpDescs: ";
+    for (auto& var : vars) {
+      out << "Order " << var.first << " [";
+      for (auto& var_name : var.second) {
+        out << var_name << ", ";
+        ++grads_of_stale_program;
+      }
+      out << "], ";
+    }
+    VLOG(10) << out.str();
+
+    std::stringstream out2;
+    out2 << "Get Order From Topological order: ";
+    for (auto& op : all_reduce_op_handles) {
+      bool find_valid_input = false;
+      for (auto& in_var : op->Inputs()) {
+        if (dynamic_cast<VarHandle*>(in_var)) {
+          out2 << in_var->Name() << ", ";
+          find_valid_input = true;
+          break;
+        }
+      }
+      PADDLE_ENFORCE(find_valid_input, "Did not find a valid input.");
+    }
+    VLOG(10) << out2.str();
+    if (grads_of_stale_program != all_reduce_op_handles.size()) {
+      VLOG(10) << "The number of gradients in the stale program and in the "
+                  "graph is not equal.";
+    }
   }
-}
 
+  std::map<int, std::vector<std::string>> GetSortedGradientsFromStaleProgram(
+      const ir::Graph& graph) const {
+    std::map<int, std::vector<std::string>> vars;
+    auto ops = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
+    int order = 0;
+    for (auto* op_desc : ops) {
+      try {
+        bool is_bk_op =
+            static_cast<bool>(boost::get<int>(op_desc->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward));
+        if (!is_bk_op) continue;
+
+        auto backward_vars =
+            boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        if (backward_vars.empty()) continue;
+
+        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+        for (size_t i = 1; i < backward_vars.size(); i += 2) {
+          vars[order].emplace_back(backward_vars[i]);
+          VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
+                  << ", " << backward_vars[i];
+        }
+        order++;
+      } catch (boost::bad_get e) {
+      }
+    }
+    return vars;
+  }
+};
 
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
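The core of the rewritten pass is easy to state: collect the all_reduce ops level by level in a stable topological order, sort each level deterministically, then chain consecutive ops with a control-dependency variable. The sketch below is a self-contained C++ model of that chaining, not Paddle code; FakeOp, the gradient names, and the pointer-based control edge are invented stand-ins, and the comparator mirrors the one used in GetSortedAllReduceOps.

    // Standalone sketch (not Paddle code): models how one BFS level of
    // all-reduce ops is ordered and then chained with control dependencies.
    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    struct FakeOp {
      std::string first_input;             // stands in for the first VarHandle name
      const FakeOp* control_dep = nullptr; // stands in for the DummyVarHandle edge
    };

    int main() {
      // Suppose one BFS level of the graph exposed these all-reduce ops.
      std::vector<FakeOp> level = {{"w2@GRAD"}, {"w0@GRAD"}, {"w1@GRAD"}};

      // Deterministic per-level order, mirroring the pass's comparator
      // (descending by the name of the first input).
      std::sort(level.begin(), level.end(),
                [](const FakeOp& l, const FakeOp& r) {
                  return l.first_input > r.first_input;
                });

      // Chain op i after op i-1, as ApplyImpl does with DummyVarHandle.
      for (size_t i = 1; i < level.size(); ++i) {
        level[i].control_dep = &level[i - 1];
      }
      for (const auto& op : level) {
        std::printf("all_reduce(%s) after %s\n", op.first_input.c_str(),
                    op.control_dep ? op.control_dep->first_input.c_str()
                                   : "none");
      }
      return 0;
    }

Because every trainer sorts each level by the same key, the chained execution order is identical across nodes, which is exactly the property the NOTE(zcd) comment says is needed to avoid hangs.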
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h
deleted file mode 100644
index 4ed3736587aa3d45e288e3dc7e6ab3099f935f41..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-// TODO(gongwb): overlap allreduce with backward computation.
-class AllReduceDepsPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 6e477cd2977561ddb914e4a6343f677044fad4be..ed75b48090b27a9c430afe067467a1a39d711938 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -28,7 +28,7 @@
 // asynchronous nccl allreduce or synchronous issue:
 // https://github.com/PaddlePaddle/Paddle/issues/15049
 DEFINE_bool(
-    sync_nccl_allreduce, false,
+    sync_nccl_allreduce, true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
    "after allreduce, this mode can get better performance in some scenarios.");
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 027de4cda410178fbae11f1db9a580c2b7ad22a3..f8bf43bcb48226b4d1317a78ade7179741097378 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -163,14 +163,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
                   "graph_printer", new details::GraphvizSSAGraphPrinter);
     }
 
-    // Verify that the graph is correct for multi-device executor.
-    AppendPass("multi_devices_check_pass");
-
-    if (VLOG_IS_ON(2)) {
-      AppendPass("all_reduce_deps_pass");
-    }
-
-    if (SeqOnlyAllReduceOps(strategy_)) {
+    // Experiments show that the program runs faster if the
+    // all_reduce_deps_pass is appended here.
+    if (!strategy_.enable_parallel_graph_ &&
+        (SeqOnlyAllReduceOps(strategy_) ||
+         strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) {
       VLOG(10) << "Add all_reduce_deps_pass";
       AppendPass("all_reduce_deps_pass");
     }
@@ -179,6 +176,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       VLOG(10) << "Add modify_op_lock_and_record_event_pass";
       AppendPass("modify_op_lock_and_record_event_pass");
     }
+
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
   }
 
   // Convert graph to run on multi-devices.
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 5811693b7ce6d6f2aebc9a8896960226295bd3e5..cc48c51e924039d93b2e1e18bea752611e7bef92 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -91,7 +91,11 @@ struct BuildStrategy {
 
   bool enable_sequential_execution_{false};
 
-  bool fuse_broadcast_op_{false};
+  // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
+  // faster, because fusing broadcast ops is equivalent to delaying the
+  // execution of all broadcast ops; in that case, all nccl streams are used
+  // only for reduce operations for a period of time.
+  bool fuse_broadcast_ops_{false};
 
   // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
   // num_trainers is 1, so the current fields of build_strategy doesn't tell if
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 297ee92fc3c84c2feec9cb85bd8671ce8ad94ed0..3e805bd5b480241954960f92a72514723c3a8bb7 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -56,6 +56,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   fetches.resize(fetch_tensors.size());
   std::unordered_map<std::string, std::vector<VarHandleBase*>> fetched_vars;
   std::vector<FetchOpHandle*> fetch_ops;
+  std::vector<FetchOpHandle*> ready_fetch_ops;
 
   for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
@@ -70,8 +71,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
     auto &var_name = fetch_tensors[i];
     auto fetched_var_it = fetched_vars.find(var_name);
     PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
+                   "is not set to ParallelExecutor)",
+                   var_name);
 
     auto &vars = fetched_var_it->second;
 
@@ -88,7 +90,11 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
       op->AddInput(var);
     }
 
-    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
+    int dep = static_cast<int>(op->NotReadyInputSize());
+    (*op_deps)[op] = dep;
+    if (dep == 0) {
+      ready_fetch_ops.emplace_back(op);
+    }
   }
 
   size_t num_complete = 0;
@@ -97,7 +103,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   for (auto op : bootstrap_ops_) {
     RunOpAsync(op_deps.get(), op, complete_q);
   }
-
+  for (auto op : ready_fetch_ops) {
+    RunOpAsync(op_deps.get(), op, complete_q);
+  }
   while (num_complete != op_deps->size()) {
     size_t num_comp = complete_q->Pop();
     if (num_comp == -1UL) {
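The executor changes above (and the threaded_ssa_graph_executor change below) are instances of the same counter-based scheduling: an op whose not-ready-input count is zero is runnable immediately, and a finishing op decrements the counters of its consumers. A minimal standalone sketch of that discipline, with invented op names rather than the executor's actual types:

    // Standalone sketch (not Paddle code): dependency-count scheduling.
    #include <cstdio>
    #include <map>
    #include <queue>
    #include <string>
    #include <vector>

    int main() {
      // op name -> number of not-yet-ready inputs (invented example data).
      std::map<std::string, int> op_deps = {
          {"fetch_loss", 1}, {"fetch_acc", 0}, {"all_reduce", 0}};
      // edges: when the key op finishes, these ops lose one pending input.
      std::map<std::string, std::vector<std::string>> pending = {
          {"all_reduce", {"fetch_loss"}}};

      std::queue<std::string> ready;
      for (auto& kv : op_deps)
        if (kv.second == 0) ready.push(kv.first);  // the "ready_fetch_ops" idea

      while (!ready.empty()) {
        auto op = ready.front();
        ready.pop();
        std::printf("run %s\n", op.c_str());
        for (auto& next : pending[op])
          if (--op_deps[next] == 0) ready.push(next);
      }
      return 0;
    }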
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 232d82a5da596a78d2999c4a4c4f7dda0c7cad7e..6c8b8937ebe646042f71cb58cfbc2d32426a4e3c 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
-
 #include <string>
 #include <vector>
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -44,6 +44,7 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
 }
 
 void FetchOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
   WaitInputVarGenerated(platform::CPUPlace());
 
   tensors_.resize(inputs_.size());
@@ -62,7 +63,8 @@ void FetchOpHandle::RunImpl() {
       auto &t = var->Get<framework::LoDTensor>();
       if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-        TensorCopySync(t, cpu, &tensors_[i]);
+        TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]);
+        dev_ctxes_.at(t.place())->Wait();
 #endif
       } else {
         tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index e22bd3917895be8d84a83b3986c6919564b2ddab..f213e07b555ca9fc4b73a2f91412063f4e7f47d4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -658,7 +658,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
 
 void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
   if (UseGPU()) {
-    if (strategy_.fuse_broadcast_op_) {
+    if (strategy_.fuse_broadcast_ops_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
       for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
@@ -1021,7 +1021,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
     return;
   }
-  if (strategy_.fuse_broadcast_op_) {
+  if (strategy_.fuse_broadcast_ops_) {
     CreateFusedBroadcastOp(result, bcast_var_name_set_);
   } else {
     for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 413b14961631b3459e0d05af685ad1c5395844c2..69cd84ebf2d678c089141f09a92c46e3a03fe4d9 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -68,7 +68,7 @@ void OpHandleBase::Run(bool use_cuda) {
       if (out_var_handle) {
         PADDLE_ENFORCE(
             platform::is_same_place(place, out_var_handle->place()),
-            "The place of input(%s) is not consistent with the "
+            "The place of output(%s) is not consistent with the "
            "place of current op(%s).",
             out_var_handle->Name(), Name());
         out_var_handle->SetGenerateEvent(events_.at(dev_id));
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 5dde0b76b816bc5309b455d58deb8942300c6af5..67246a4dd448b0ce2f115d6438c5fdd6cc39ca6d 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -80,7 +80,6 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
     }
     set.clear();
   };
-  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
@@ -116,7 +115,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
           auto &deps = pending_ops[op];
           --deps;
           if (deps == 0) {
-            run_all_op(op);
+            ready_ops.insert(op);
           }
         }
       }
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 353db435213c74982d582e5be298ecfb1a810f30..e6f5b15af8cd440a9304235acfe62787c5f1b134 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -617,6 +617,25 @@ void OpDesc::Flush() {
 
 static std::once_flag init_infer_shape_funcs;
 
+/**
+ * NOTE(paddle-dev): Very tricky code here. Maybe we should find a
+ * better way to register the compile-time infershape method gently.
+ *
+ * Normally, we can register a class derived from InferShapeBase, so that
+ * we can set the field of `infer_shape_` inside OpInfo when registering op.
+ *
+ * However, there is another way we can set the field of `infer_shape_` inside
+ * OpInfo. Usually, we overload the InferShape method of OperatorWithKernel.
+ * After running the following method InitInferShapeFuncs, `infer_shape_` would
+ * be set to the InferShape method of OperatorWithKernel. That is to say, we
+ * borrow the run-time InferShape method of OperatorWithKernel to be the
+ * compile-time InferShape method.
+ *
+ * However, during compile time, we may not know the inputs, outputs and attrs
+ * of the run-time OperatorWithKernel. So the following code creates a fake
+ * OperatorWithKernel object. That is why the field info_ of OperatorBase
+ * would be null.
+ */
 static void InitInferShapeFuncs() {
   std::call_once(init_infer_shape_funcs, [] {
     auto &map = OpInfoMap::Instance();
@@ -628,11 +647,16 @@ static void InitInferShapeFuncs() {
       PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered",
                      op_type);
       auto &op_info = it->second;
-      auto op = static_cast<OperatorWithKernel *>(op_info.Creator()(
-          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
       if (op_info.infer_shape_) {  // infer_shape has been registered.
         continue;
       }
+
+      auto op = dynamic_cast<OperatorWithKernel *>(op_info.Creator()(
+          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
+
+      PADDLE_ENFORCE_NOT_NULL(
+          op, "InferShapeBase is not registered to Operator %s", op_type);
+
       op_info.infer_shape_ = [op](InferShapeContext *ctx) {
         op->InferShape(ctx);
       };
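The NOTE in op_desc.cc describes a registration trick: a throwaway operator instance is created only so that its virtual InferShape method can be captured into the compile-time infer_shape_ functor. A minimal sketch of the same capture pattern, with invented type names rather than Paddle's:

    // Standalone sketch (not Paddle code): "borrowing" a virtual method from
    // a throwaway instance by capturing the instance in a std::function.
    #include <cstdio>
    #include <functional>

    struct Context {};

    struct RuntimeOp {
      virtual ~RuntimeOp() = default;
      virtual void InferShape(Context* /*ctx*/) const {
        std::printf("runtime rule\n");
      }
    };

    int main() {
      std::function<void(Context*)> infer_shape;  // plays the role of infer_shape_

      // A fake instance created only so its virtual InferShape can be reused;
      // it intentionally leaks here, mirroring the lambda capture in
      // InitInferShapeFuncs.
      auto* op = new RuntimeOp();
      infer_shape = [op](Context* ctx) { op->InferShape(ctx); };

      Context ctx;
      infer_shape(&ctx);  // compile-time call routed to the run-time method
      return 0;
    }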
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index a2a8083da955c93175ab2f01a37737c145e6f1b8..4245caf1689c76d72b410c742488c55562c8b998 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -19,11 +19,6 @@ limitations under the License. */
 #include
 #include
 #include
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
-#include "paddle/fluid/framework/ir/graph.h"
-
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -31,6 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
 #ifdef WITH_GPERFTOOLS
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index e5036d940197ef012cbfd8f52700c8aeb54fb6c5..0109b4a4fa7617880f642f5a39639bca38475515 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -142,7 +142,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 
 void AnalysisConfig::EnableMKLDNN() {
 #ifdef PADDLE_WITH_MKLDNN
-  pass_builder()->EnableMKLDNN();
   use_mkldnn_ = true;
 #else
   LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
@@ -235,16 +234,13 @@ void AnalysisConfig::Update() {
   }
 
   if (use_mkldnn_) {
+#ifdef PADDLE_WITH_MKLDNN
     if (!enable_ir_optim_) {
       LOG(ERROR)
           << "EnableMKLDNN() only works when IR optimization is enabled.";
+    } else {
+      pass_builder()->EnableMKLDNN();
     }
-#ifdef PADDLE_WITH_MKLDNN
-    pass_builder()->EnableMKLDNN();
-    use_mkldnn_ = true;
-#else
-    LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
-    use_mkldnn_ = false;
 #endif
   }
 
@@ -256,9 +252,6 @@ void AnalysisConfig::Update() {
   }
 #ifdef PADDLE_WITH_MKLDNN
   pass_builder()->EnableMkldnnQuantizer();
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
-  use_mkldnn_quantizer_ = false;
 #endif
 }
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 258a79fa4e884177490fab79778151ae52537aa0..c89dd41e0a6283e0723e2925f28c0372cda6a2b2 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -27,6 +27,7 @@
 #include
 #include
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 
@@ -266,17 +267,17 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
 }
 
 static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-                      double latency, int epoch = 1) {
-  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
-            << ", threads: " << num_threads << ", thread id: " << tid
-            << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f)
+                      double batch_latency, int epoch = 1) {
+  PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size.");
+  double sample_latency = batch_latency / batch_size;
+  LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
             << " ======";
-  if (epoch > 1) {
-    int samples = batch_size * epoch;
-    LOG(INFO) << "====== sample number: " << samples
-              << ", average latency of each sample: " << latency / samples
-              << "ms ======";
-  }
+  LOG(INFO) << "====== batch_size: " << batch_size << ", iterations: " << epoch
+            << ", repetitions: " << repeat << " ======";
+  LOG(INFO) << "====== batch latency: " << batch_latency
+            << "ms, number of samples: " << batch_size * epoch
+            << ", sample latency: " << sample_latency
+            << "ms, fps: " << 1000.f / sample_latency << " ======";
 }
 
 static bool IsFileExists(const std::string &path) {
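The reworked PrintTime derives per-sample latency and fps from the measured per-batch latency, so the arithmetic is worth sanity-checking in isolation. A small sketch with invented numbers (batch size 50, two iterations), not the helper itself:

    // Standalone sketch of the latency bookkeeping in PrintTime.
    #include <cassert>
    #include <cstdio>

    int main() {
      int batch_size = 50;
      int iterations = 2;               // number of processed batches
      double batch_latency_ms = 125.0;  // measured average per batch
      assert(batch_size > 0);

      double sample_latency_ms = batch_latency_ms / batch_size;  // 2.5 ms
      double fps = 1000.0 / sample_latency_ms;                   // 400 samples/s
      int num_samples = batch_size * iterations;                 // 100

      std::printf("batch latency: %.2f ms, samples: %d\n", batch_latency_ms,
                  num_samples);
      std::printf("sample latency: %.2f ms, fps: %.1f\n", sample_latency_ms,
                  fps);
      return 0;
    }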
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 1d1d39e44096b9f50e5bc9603fa12aba92b0e8e2..87e02a02caebd93d701dfd9e51c35fb974c770ed 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -64,10 +64,12 @@ void PaddlePassBuilder::DeletePass(size_t idx) {
   passes_.erase(std::begin(passes_) + idx);
 }
 
-void GpuPassStrategy::EnableMKLDNN() {
-  LOG(ERROR) << "GPU not support MKLDNN yet";
+void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
+  analysis_passes_.push_back(pass);
 }
 
+void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
+
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
     "infer_clean_graph_pass",  //
@@ -102,12 +104,12 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   use_gpu_ = true;
 }
 
-void GpuPassStrategy::EnableMkldnnQuantizer() {
-  LOG(ERROR) << "GPU not support MKL-DNN quantization";
+void GpuPassStrategy::EnableMKLDNN() {
+  LOG(ERROR) << "GPU does not support MKLDNN yet";
 }
 
-void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
-  analysis_passes_.push_back(pass);
+void GpuPassStrategy::EnableMkldnnQuantizer() {
+  LOG(ERROR) << "GPU does not support MKL-DNN quantization";
 }
 
 CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
@@ -135,5 +137,39 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   });
   use_gpu_ = false;
 }
-void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
+
+void CpuPassStrategy::EnableMKLDNN() {
+// TODO(Superjomn) Consider the way to mix CPU with GPU.
+#ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_) {
+    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
+
+    for (auto &pass : std::vector<std::string>(
+             {"depthwise_conv_mkldnn_pass",    //
+              "conv_bn_fuse_pass",             // Execute BN passes again to
+              "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
+              "conv_bias_mkldnn_fuse_pass",    //
+              "conv3d_bias_mkldnn_fuse_pass",  //
+              "conv_elementwise_add_mkldnn_fuse_pass",
+              "conv_relu_mkldnn_fuse_pass"})) {
+      passes_.push_back(pass);
+    }
+  }
+  use_mkldnn_ = true;
+#else
+  use_mkldnn_ = false;
+#endif
+}
+
+void CpuPassStrategy::EnableMkldnnQuantizer() {
+#ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_quantizer_) {
+    passes_.push_back("cpu_quantize_placement_pass");
+  }
+  use_mkldnn_quantizer_ = true;
+#else
+  use_mkldnn_quantizer_ = false;
+#endif
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 48da8c156f426477011bcc060260c812ad94df23..09ef195d5e66aff0cef17f1594de34c656187a35 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -109,43 +109,16 @@ class CpuPassStrategy : public PassStrategy {
   CpuPassStrategy();
 
   explicit CpuPassStrategy(const CpuPassStrategy &other)
-      : PassStrategy(other.AllPasses()) {}
+      : PassStrategy(other.AllPasses()) {
+    use_gpu_ = other.use_gpu_;
+    use_mkldnn_ = other.use_mkldnn_;
+    use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_;
+  }
 
   virtual ~CpuPassStrategy() = default;
 
-  void EnableMKLDNN() override {
-// TODO(Superjomn) Consider the way to mix CPU with GPU.
-#ifdef PADDLE_WITH_MKLDNN
-    if (!use_mkldnn_) {
-      passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-
-      for (auto &pass : std::vector<std::string>(
-               {"depthwise_conv_mkldnn_pass",    //
-                "conv_bn_fuse_pass",             // Execute BN passes again to
-                "conv_eltwiseadd_bn_fuse_pass",  // preserve correct pass order
-                "conv_bias_mkldnn_fuse_pass",    //
-                "conv3d_bias_mkldnn_fuse_pass",  //
-                "conv_relu_mkldnn_fuse_pass",    //
-                "conv_elementwise_add_mkldnn_fuse_pass"})) {
-        passes_.push_back(pass);
-      }
-    }
-    use_mkldnn_ = true;
-#else
-    use_mkldnn_ = false;
-#endif
-  }
-
-  void EnableMkldnnQuantizer() override {
-#ifdef PADDLE_WITH_MKLDNN
-    if (!use_mkldnn_quantizer_) {
-      passes_.push_back("cpu_quantize_placement_pass");
-    }
-    use_mkldnn_quantizer_ = true;
-#else
-    use_mkldnn_quantizer_ = false;
-#endif
-  }
+  void EnableMKLDNN() override;
+  void EnableMkldnnQuantizer() override;
 
  protected:
   bool use_mkldnn_quantizer_{false};
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 647913cc80727786379e2e5525b372818e423d23..8ecb0310c9775393631b99681e13cbea7a5b781e 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -26,7 +26,11 @@ endfunction()
 function(inference_analysis_api_int8_test target model_dir data_dir filename)
   inference_analysis_test(${target} SRCS ${filename}
       EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
-      ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100)
+      ARGS --infer_model=${model_dir}/model
+           --infer_data=${data_dir}/data.bin
+           --warmup_batch_size=100
+           --batch_size=50
+           --iterations=2)
 endfunction()
 
 function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
@@ -146,22 +150,22 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
 
 # int8 image classification tests
 if(WITH_MKLDNN)
-  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8")
+  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
   if (NOT EXISTS ${INT8_DATA_DIR})
-    inference_download_and_uncompress(${INT8_DATA_DIR} ${INFERENCE_URL}"/int8" "imagenet_val_100.tar.gz")
+    inference_download_and_uncompress(${INT8_DATA_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100_tail.tar.gz")
   endif()
 
   #resnet50 int8
   set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
   if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR})
-    inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} ${INFERENCE_URL}"/int8" "resnet50_int8_model.tar.gz" )
+    inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "${INFERENCE_URL}/int8" "resnet50_int8_model.tar.gz" )
   endif()
   inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
 
   #mobilenet int8
   set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet")
   if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR})
-    inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} ${INFERENCE_URL}"/int8" "mobilenetv1_int8_model.tar.gz" )
+    inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "${INFERENCE_URL}/int8" "mobilenetv1_int8_model.tar.gz" )
   endif()
   inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
 endif()
diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
index e73358d8827a40786beb05fad931267b0dd88f6b..9b2e74ec16eb3b6e98bfcc8cc546ed74a7966f33 100644
--- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
@@ -154,7 +154,7 @@ void profile(bool use_mkldnn = false) {
     config.EnableMKLDNN();
   }
 
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   std::vector<std::vector<PaddleTensor>> inputs;
   LoadInputData(&inputs);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&config),
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index 735e4fb563788438ee49ff6308d11f4dbe4962be..e10d239a5d1b30e089a110c6155520e3b035860a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -197,7 +197,7 @@ void profile(bool use_mkldnn = false) {
     cfg.SetMKLDNNOp(op_list);
   }
 
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
 
@@ -206,9 +206,11 @@ void profile(bool use_mkldnn = false) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     PADDLE_ENFORCE_GT(outputs.size(), 0);
-    size_t size = GetSize(outputs[0]);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_GT(output.size(), 0);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
     }
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
index 5a4f9a31a164a8fca3f80ce2fe2e6065fd04b340..ece094717b8076321c68d7fdd29f07c4da6b0ed4 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
-DEFINE_int32(iterations, 0, "Number of iterations");
-
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -30,8 +28,13 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->SwitchIrOptim();
   cfg->SwitchSpecifyInputNames(false);
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-  cfg->EnableMKLDNN();
+
+  cfg->pass_builder()->SetPasses(
+      {"infer_clean_graph_pass", "mkldnn_placement_pass",
+       "depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
+       "conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
+       "conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
+       "fc_fuse_pass", "is_test_pass"});
 }
 
 template <typename T>
@@ -40,8 +43,8 @@ class TensorReader {
  public:
   TensorReader(std::ifstream &file, size_t beginning_offset,
                std::vector<int> shape, std::string name)
       : file_(file), position(beginning_offset), shape_(shape), name_(name) {
-    numel =
-        std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<T>());
+    numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
+                            std::multiplies<size_t>());
   }
 
   PaddleTensor NextBatch() {
@@ -71,10 +74,14 @@ class TensorReader {
 };
 
 std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
-    const std::vector<std::vector<PaddleTensor>> &test_data, int num_images) {
+    const std::vector<std::vector<PaddleTensor>> &test_data,
+    int num_images = FLAGS_warmup_batch_size) {
   int test_data_batch_size = test_data[0][0].shape[0];
-  CHECK_LE(static_cast<size_t>(num_images),
-           test_data.size() * test_data_batch_size);
+  auto iterations_max = test_data.size();
+  PADDLE_ENFORCE(
+      static_cast<size_t>(num_images) <= iterations_max * test_data_batch_size,
+      "The requested quantization warmup data size " +
+          std::to_string(num_images) + " is larger than the test data size.");
 
   PaddleTensor images;
   images.name = "input";
@@ -120,20 +127,17 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
   std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
   std::vector<int> label_batch_shape{batch_size, 1};
+  auto images_offset_in_file = static_cast<size_t>(file.tellg());
   auto labels_offset_in_file =
-      static_cast<size_t>(file.tellg()) +
-      sizeof(float) * total_images *
-          std::accumulate(image_batch_shape.begin() + 1,
-                          image_batch_shape.end(), 1, std::multiplies<int>());
+      images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
 
-  TensorReader<float> image_reader(file, 0, image_batch_shape, "input");
+  TensorReader<float> image_reader(file, images_offset_in_file,
+                                   image_batch_shape, "input");
   TensorReader<int64_t> label_reader(file, labels_offset_in_file,
                                      label_batch_shape, "label");
 
-  auto iterations = total_images / batch_size;
-  if (FLAGS_iterations > 0 && FLAGS_iterations < iterations)
-    iterations = FLAGS_iterations;
-  for (auto i = 0; i < iterations; i++) {
+  auto iterations_max = total_images / batch_size;
+  for (auto i = 0; i < iterations_max; i++) {
     auto images = image_reader.NextBatch();
     auto labels = label_reader.NextBatch();
     inputs->emplace_back(
@@ -148,20 +152,21 @@ TEST(Analyzer_int8_resnet50, quantization) {
   AnalysisConfig q_cfg;
   SetConfig(&q_cfg);
 
+  // read data from file and prepare batches with test data
   std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all, 100);
+  SetInput(&input_slots_all);
 
+  // prepare warmup batch from input data read earlier
+  // warmup batch size can be different than batch size
   std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
-      GetWarmupData(input_slots_all, 100);
+      GetWarmupData(input_slots_all);
 
+  // configure quantizer
   q_cfg.EnableMkldnnQuantizer();
   q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
-  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
+  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size);
 
-  CompareQuantizedAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-      reinterpret_cast<const PaddlePredictor::Config *>(&q_cfg),
-      input_slots_all);
+  CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all);
 }
 
 }  // namespace analysis
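Pulled out of the test above, the quantization setup reduces to a few AnalysisConfig calls. The following is a usage sketch under assumptions, not the test's exact flow: the model path is a placeholder, and the warmup batch is assumed to be prepared elsewhere (for example by a helper like GetWarmupData).

    // Hypothetical usage sketch based on the test above (not the test itself).
    #include <memory>
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_analysis_config.h"
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    std::unique_ptr<paddle::PaddlePredictor> BuildInt8Predictor(
        std::shared_ptr<std::vector<paddle::PaddleTensor>> warmup_data) {
      paddle::AnalysisConfig q_cfg;
      q_cfg.SetModel("/path/to/model");  // placeholder path

      // Collect scale statistics from a warmup batch, then quantize.
      q_cfg.EnableMkldnnQuantizer();
      q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
      q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);

      return paddle::CreatePaddlePredictor(q_cfg);
    }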
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 347672eaae314aa42096d48a3b044014f2ddbf84..142905dcd8d9964d93d0c5f7444823eef2b84900 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -124,7 +124,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_LAC, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -137,11 +137,13 @@ TEST(Analyzer_LAC, profile) {
         24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24,
         25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
         14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
     PADDLE_ENFORCE_GE(size, batch1_size);
-    int64_t *pdata = static_cast<int64_t *>(outputs[0].data.data());
+    int64_t *pdata = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < batch1_size; ++i) {
       EXPECT_EQ(pdata[i], lac_ref_data[i]);
     }
diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
index 089f655c180d784af66af60277bdbf32a6019599..2eb347a44b394a55706d5aa88bee7fe1fcc7838e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
@@ -96,7 +96,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
@@ -108,8 +108,9 @@ void profile(bool use_mkldnn = false) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    PADDLE_ENFORCE_EQ(outputs.size(), 2UL);
-    for (auto &output : outputs) {
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_EQ(outputs.back().size(), 2UL);
+    for (auto &output : outputs.back()) {
       size_t size = GetSize(output);
       PADDLE_ENFORCE_GT(size, 0);
       float *result = static_cast<float *>(output.data.data());
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index a70aa7a6ac41121a0c8ea397ebc7e24e4b206d12..36e07d5f55600dc7aa96227289f707fb19f92d56 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -106,7 +106,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool memory_load = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg, memory_load);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -117,10 +117,12 @@ void profile(bool memory_load = false) {
     // the first inference result
     const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                            48, 39, 38, 16, 25};
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    int64_t *result = static_cast<int64_t *>(outputs[0].data.data());
+    int64_t *result = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < std::min(11UL, size); i++) {
       EXPECT_EQ(result[i], chinese_ner_result_data[i]);
     }
diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
index 5157bd280d0f3ee327d5cee7799477b5e6fd3f71..9443b08063b8f61d3d6b291a7217d645d8825c54 100644
--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
@@ -127,7 +127,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_Pyramid_DNN, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -135,10 +135,12 @@ TEST(Analyzer_Pyramid_DNN, profile) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
     for (size_t i = 0; i < size; i++) {
       EXPECT_GT(result[i], 0);
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 629981d565f1b6eeabc192287cb9f892df21b8e4..d4330e6cddf8818ace01be2f13a4c18a192c46e1 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -40,7 +40,7 @@ void profile(bool use_mkldnn = false) {
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
   }
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index dcf4b38ce8a9230148738cfd0840ca96b0c7cf8c..54fd3a4a4caba52110ab636e6d44ee2a473f0cb0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -229,7 +229,7 @@ TEST(Analyzer_rnn1, profile) {
   SetConfig(&cfg);
   cfg.DisableGpu();
   cfg.SwitchIrDebug();
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -280,7 +280,7 @@ TEST(Analyzer_rnn1, compare_determine) {
 TEST(Analyzer_rnn1, multi_thread) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
index 007f9f0b66a7b276f5f2e8500a3001788ad41e79..9ccbf58cbd2bbaab9b1a132c27e50356e1a5df37 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -126,7 +126,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_rnn2, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -136,9 +136,11 @@ TEST(Analyzer_rnn2, profile) {
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
     PADDLE_ENFORCE_GT(outputs.size(), 0);
-    size_t size = GetSize(outputs[0]);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_GT(output.size(), 0);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
     }
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index 47c1d7375843e4bad212c1d7d621c9e6d45e5982..9f23b9f037bcaeb758312d011067ae29c82e73cd 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -110,7 +110,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_seq_conv1, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -119,10 +119,12 @@ TEST(Analyzer_seq_conv1, profile) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
-    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    auto output = outputs.back();
+    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    size_t size = GetSize(output[0]);
     PADDLE_ENFORCE_GT(size, 0);
-    float *result = static_cast<float *>(outputs[0].data.data());
+    float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
     for (size_t i = 0; i < size; i++) {
       EXPECT_GT(result[i], 0);
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index 19fa5528da4d11d2eb1a2f932f60a84c3f5468e7..d6f7f468a6c83bd6c4ac087931d0c6b7cac3cc1c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -156,7 +156,7 @@ void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg, use_mkldnn);
 
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 2003be82019333ca97b9fa8ef83668825fe5710d..54492dbc238bbaf25f86b300fdd6585f74365088 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -70,7 +70,7 @@ TEST(Analyzer_Text_Classification, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
   cfg.SwitchIrDebug();
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -79,8 +79,9 @@ TEST(Analyzer_Text_Classification, profile) {
 
   if (FLAGS_num_threads == 1) {
     // Get output
-    LOG(INFO) << "get outputs " << outputs.size();
-    for (auto &output : outputs) {
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    LOG(INFO) << "get outputs " << outputs.back().size();
+    for (auto &output : outputs.back()) {
       LOG(INFO) << "output.shape: " << to_string(output.shape);
       // no lod ?
       CHECK_EQ(output.lod.size(), 0UL);
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
index a925da312cde30380b4997b8b76a0d425a71e817..bd4f1b61973fb0de06dcc288e329c94756d5ed47 100644
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
@@ -186,7 +186,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
   }
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index ca04c1365cbbffcb4a2786cde9ab240cc20aa3d8..fb47048cd0ccc887927cb4b533d45df11ef633eb 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -87,7 +87,7 @@ void profile(bool use_mkldnn = false) {
     cfg.EnableMKLDNN();
   }
   // cfg.pass_builder()->TurnOnDebug();
-  std::vector<PaddleTensor> outputs;
+  std::vector<std::vector<PaddleTensor>> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -100,7 +100,8 @@ void profile(bool use_mkldnn = false) {
   auto refer = ProcessALine(line);
   file.close();
 
-  auto &output = outputs.front();
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  auto &output = outputs.back().front();
   size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
   CHECK_EQ(numel, refer.data.size());
   for (size_t i = 0; i < numel; ++i) {
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 33f1d0254858814be20eee1a6c2faaf00c2e8178..9a0dcc722cf00984b8c0e3ac20f13849e2904102 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -41,7 +41,10 @@
 DEFINE_string(model_name, "", "model name");
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
 DEFINE_string(refer_result, "", "reference result for comparison");
-DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(batch_size, 1, "batch size");
+DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
+// setting iterations to 0 means processing the whole dataset
+DEFINE_int32(iterations, 0, "number of batches to process");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
@@ -239,7 +242,7 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
       }
       input.shape = shape;
       input.dtype = PaddleDType::FLOAT32;
-      size_t len = std::accumulate(shape.begin(), shape.end(), 1,
+      size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1},
                                    [](int a, int b) { return a * b; });
       input.data.Resize(len * sizeof(float));
       input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}});
@@ -286,17 +289,18 @@ void ConvertPaddleTensorToZeroCopyTensor(
 
 void PredictionWarmUp(PaddlePredictor *predictor,
                       const std::vector<std::vector<PaddleTensor>> &inputs,
-                      std::vector<PaddleTensor> *outputs, int num_threads,
-                      int tid) {
+                      std::vector<std::vector<PaddleTensor>> *outputs,
+                      int num_threads, int tid) {
   int batch_size = FLAGS_batch_size;
   LOG(INFO) << "Running thread " << tid << ", warm up run...";
   if (FLAGS_zero_copy) {
     ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
   }
+  outputs->resize(1);
   Timer warmup_timer;
   warmup_timer.tic();
   if (!FLAGS_zero_copy) {
-    predictor->Run(inputs[0], outputs, batch_size);
+    predictor->Run(inputs[0], &(*outputs)[0], batch_size);
   } else {
     predictor->ZeroCopyRun();
   }
@@ -308,11 +312,16 @@ void PredictionWarmUp(PaddlePredictor *predictor,
 
 void PredictionRun(PaddlePredictor *predictor,
                    const std::vector<std::vector<PaddleTensor>> &inputs,
-                   std::vector<PaddleTensor> *outputs, int num_threads,
-                   int tid) {
-  int batch_size = FLAGS_batch_size;
+                   std::vector<std::vector<PaddleTensor>> *outputs,
+                   int num_threads, int tid) {
   int num_times = FLAGS_repeat;
-  LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
+  int iterations = inputs.size();  // process the whole dataset ...
+  if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size())
+    iterations =
+        FLAGS_iterations;  // ... unless the number of iterations is set
+  outputs->resize(iterations);
+  LOG(INFO) << "Thread " << tid << ", number of threads " << num_threads
+            << ", run " << num_times << " times...";
   Timer run_timer;
   double elapsed_time = 0;
 #ifdef WITH_GPERFTOOLS
@@ -320,14 +329,14 @@ void PredictionRun(PaddlePredictor *predictor,
 #endif
   if (!FLAGS_zero_copy) {
     run_timer.tic();
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for (size_t i = 0; i < iterations; i++) {
       for (int j = 0; j < num_times; j++) {
-        predictor->Run(inputs[i], outputs, batch_size);
+        predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
       }
     }
     elapsed_time = run_timer.toc();
   } else {
-    for (size_t i = 0; i < inputs.size(); i++) {
+    for (size_t i = 0; i < iterations; i++) {
       ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
       run_timer.tic();
       for (int j = 0; j < num_times; j++) {
@@ -340,13 +349,14 @@ void PredictionRun(PaddlePredictor *predictor,
   ProfilerStop();
 #endif
 
-  PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times,
-            inputs.size());
+  auto batch_latency = elapsed_time / (iterations * num_times);
+  PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency,
+            iterations);
   if (FLAGS_record_benchmark) {
     Benchmark benchmark;
     benchmark.SetName(FLAGS_model_name);
-    benchmark.SetBatchSize(batch_size);
-    benchmark.SetLatency(elapsed_time / num_times);
+    benchmark.SetBatchSize(FLAGS_batch_size);
+    benchmark.SetLatency(batch_latency);
     benchmark.PersistToFile("benchmark_record.txt");
   }
 }
@@ -354,16 +364,17 @@ void PredictionRun(PaddlePredictor *predictor,
 void TestOneThreadPrediction(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
+    std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
   auto predictor = CreateTestPredictor(config, use_analysis);
-  PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
-  PredictionRun(predictor.get(), inputs, outputs, 1, 0);
+  PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads,
+                   0);
+  PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
 }
 
 void TestMultiThreadPrediction(
     const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
-    std::vector<PaddleTensor> *outputs, int num_threads,
+    std::vector<std::vector<PaddleTensor>> *outputs, int num_threads,
     bool use_analysis = true) {
   std::vector<std::thread> threads;
   std::vector<std::unique_ptr<PaddlePredictor>> predictors;
@@ -376,7 +387,7 @@ void TestMultiThreadPrediction(
     threads.emplace_back([&, tid]() {
       // Each thread should have local inputs and outputs.
      // The inputs of each thread are all the same.
-      std::vector<PaddleTensor> outputs_tid;
+      std::vector<std::vector<PaddleTensor>> outputs_tid;
       auto &predictor = predictors[tid];
 #ifdef PADDLE_WITH_MKLDNN
       if (use_analysis) {
@@ -384,8 +395,8 @@ void TestMultiThreadPrediction(
             ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
       }
 #endif
-      PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid);
-      PredictionRun(predictor.get(), inputs, outputs, num_threads, tid);
+      PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, tid);
+      PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid);
     });
   }
   for (int i = 0; i < num_threads; ++i) {
@@ -395,8 +406,8 @@ void TestMultiThreadPrediction(
 
 void TestPrediction(const PaddlePredictor::Config *config,
                     const std::vector<std::vector<PaddleTensor>> &inputs,
-                    std::vector<PaddleTensor> *outputs, int num_threads,
-                    bool use_analysis = FLAGS_use_analysis) {
+                    std::vector<std::vector<PaddleTensor>> *outputs,
+                    int num_threads, bool use_analysis = FLAGS_use_analysis) {
   PrintConfig(config, use_analysis);
   if (num_threads == 1) {
     TestOneThreadPrediction(config, inputs, outputs, use_analysis);
@@ -406,30 +417,41 @@ void TestPrediction(const PaddlePredictor::Config *config,
   }
 }
 
-void CompareTopAccuracy(const std::vector<PaddleTensor> &output_slots1,
-                        const std::vector<PaddleTensor> &output_slots2) {
-  // first output: avg_cost
-  if (output_slots1.size() == 0 || output_slots2.size() == 0)
+void CompareTopAccuracy(
+    const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
+    const std::vector<std::vector<PaddleTensor>> &output_slots_ref) {
+  if (output_slots_quant.size() == 0 || output_slots_ref.size() == 0)
     throw std::invalid_argument(
         "CompareTopAccuracy: output_slots vector is empty.");
-  PADDLE_ENFORCE(output_slots1.size() >= 2UL);
-  PADDLE_ENFORCE(output_slots2.size() >= 2UL);
-  // second output: acc_top1
-  if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0)
-    throw std::invalid_argument(
-        "CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
-  if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 ||
-      output_slots2[1].dtype != paddle::PaddleDType::FLOAT32)
-    throw std::invalid_argument(
-        "CompareTopAccuracy: top1 accuracy output is of a wrong type.");
-  float *top1_quantized = static_cast<float *>(output_slots1[1].data.data());
-  float *top1_reference = static_cast<float *>(output_slots2[1].data.data());
-  LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized;
-  LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference;
+
+  float total_accs1_quant{0};
+  float total_accs1_ref{0};
+  for (size_t i = 0; i < output_slots_quant.size(); ++i) {
+    PADDLE_ENFORCE(output_slots_quant[i].size() >= 2UL);
+    PADDLE_ENFORCE(output_slots_ref[i].size() >= 2UL);
+    // second output: acc_top1
+    if (output_slots_quant[i][1].lod.size() > 0 ||
+        output_slots_ref[i][1].lod.size() > 0)
+      throw std::invalid_argument(
+          "CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
+    if (output_slots_quant[i][1].dtype != paddle::PaddleDType::FLOAT32 ||
+        output_slots_ref[i][1].dtype != paddle::PaddleDType::FLOAT32)
+      throw std::invalid_argument(
+          "CompareTopAccuracy: top1 accuracy output is of a wrong type.");
+    total_accs1_quant +=
+        *static_cast<float *>(output_slots_quant[i][1].data.data());
+    total_accs1_ref +=
+        *static_cast<float *>(output_slots_ref[i][1].data.data());
+  }
+  float avg_acc1_quant = total_accs1_quant / output_slots_quant.size();
+  float avg_acc1_ref = total_accs1_ref / output_slots_ref.size();
+
+  LOG(INFO) << "Avg top1 INT8 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_quant;
+  LOG(INFO) << "Avg top1 FP32 accuracy: " << std::fixed << std::setw(6)
+            << std::setprecision(4) << avg_acc1_ref;
threshold: " << FLAGS_quantized_accuracy; - CHECK_LE(std::abs(*top1_quantized - *top1_reference), - FLAGS_quantized_accuracy); + CHECK_LE(std::abs(avg_acc1_quant - avg_acc1_ref), FLAGS_quantized_accuracy); } void CompareDeterministic( @@ -455,20 +477,35 @@ void CompareNativeAndAnalysis( const PaddlePredictor::Config *config, const std::vector> &inputs) { PrintConfig(config, true); - std::vector native_outputs, analysis_outputs; + std::vector> native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - CompareResult(analysis_outputs, native_outputs); + PADDLE_ENFORCE(native_outputs.size() > 0, "Native output is empty."); + PADDLE_ENFORCE(analysis_outputs.size() > 0, "Analysis output is empty."); + CompareResult(analysis_outputs.back(), native_outputs.back()); } void CompareQuantizedAndAnalysis( - const PaddlePredictor::Config *config, - const PaddlePredictor::Config *qconfig, + const AnalysisConfig *config, const AnalysisConfig *qconfig, const std::vector> &inputs) { - PrintConfig(config, true); - std::vector analysis_outputs, quantized_outputs; - TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - TestOneThreadPrediction(qconfig, inputs, &quantized_outputs, true); + PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size, + "Input data has to be packed batch by batch."); + LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size + << ", warmup batch size " << FLAGS_warmup_batch_size << "."; + + LOG(INFO) << "--- FP32 prediction start ---"; + auto *cfg = reinterpret_cast(config); + PrintConfig(cfg, true); + std::vector> analysis_outputs; + TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true); + + LOG(INFO) << "--- INT8 prediction start ---"; + auto *qcfg = reinterpret_cast(qconfig); + PrintConfig(qcfg, true); + std::vector> quantized_outputs; + TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true); + + LOG(INFO) << "--- comparing outputs --- "; CompareTopAccuracy(quantized_outputs, analysis_outputs); } @@ -578,9 +615,9 @@ static bool CompareTensorData(const framework::LoDTensor &a, const framework::LoDTensor &b) { auto a_shape = framework::vectorize(a.dims()); auto b_shape = framework::vectorize(b.dims()); - size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); - size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); if (a_size != b_size) { LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index cb668a4174134ba3ce9517955ff740ada568e97b..98ce225a0476b38c021b0b81489f69d7953ae456 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -74,7 +74,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); } - std::vector outputs; + std::vector> outputs; if (use_analysis || use_tensorrt) { AnalysisConfig config; config.EnableUseGpu(100, 0); diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec index 
568db2cf06d3f1993ebc540bf83a28de8d225bd1..f0e3d3e86f24cb68f6e9d41f48c9698b43bca13e 100644 --- a/paddle/fluid/op_use_default_grad_op_maker.spec +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -3,15 +3,11 @@ acos asin atan attention_lstm -bilinear_tensor_product brelu conv_shift cos cos_sim dequantize -elementwise_div -elementwise_max -elementwise_min elu fc flatten @@ -29,8 +25,6 @@ gelu gru hard_shrink hierarchical_sigmoid -hinge_loss -huber_loss leaky_relu log logsigmoid @@ -43,7 +37,6 @@ max_pool3d_with_index maxout modified_huber_loss nce -norm pool2d pool3d pow @@ -59,17 +52,7 @@ requantize reshape rnn_memory_helper round -row_conv -sequence_concat -sequence_conv -sequence_expand -sequence_expand_as -sequence_pad -sequence_scatter -sequence_slice sequence_softmax -sequence_unpad -sigmoid_cross_entropy_with_logits sin softplus softshrink @@ -84,7 +67,6 @@ stanh swish tanh_shrink teacher_student_sigmoid_loss -temporal_shift tensor_array_to_tensor thresholded_relu transpose diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 1de59a5165c83a314a0ff8f4e4351aa3326beb67..9d7100cc3db91f5bf7dbd993c9f9ba5d4fc98ea6 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/affine_grid_op.h" +#include #include +#include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" @@ -173,9 +175,10 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - auto theta_dims = ctx->GetInputDim("Theta"); if (ctx->HasOutput(framework::GradVarName("Theta"))) { - ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims); + auto output_dims = ctx->GetInputDim(framework::GradVarName("Output")); + ctx->SetOutputDim(framework::GradVarName("Theta"), + {output_dims[0], 2, 3}); } } diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index fc15d56891cf7af10a91ca22a09c84fa2e52d465..7e2740f148f1d273310f44ed4a35d413e7201394 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -74,5 +74,8 @@ class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { virtual void Apply() = 0; }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(BatchSizeLikeNoNeedBufferVarsInference, + "Input"); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 8d261a118a75ee16027faf60341cefd30c3cdbba..bd69f422e5dbd5a5dc95150b10daa302f47ec5ff 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
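NOTE: in the affine_grid hunk above, the backward InferShape no longer touches Theta: for a 2-D affine grid Theta is [N, 2, 3], so dTheta's shape follows from dOutput's batch dimension alone and Theta's buffer can be freed before the backward pass. A minimal standalone sketch of that shape reasoning (illustrative C++ only; InferThetaGradDims is a hypothetical name, not framework code):

#include <array>
#include <cassert>
#include <cstdint>
// Recover dTheta's dims from dOutput's batch size; Theta itself is unused.
std::array<int64_t, 3> InferThetaGradDims(int64_t output_grad_batch) {
  return {output_grad_batch, 2, 3};  // Theta is always N x 2 x 3 in 2-D
}
int main() {
  const auto dims = InferThetaGradDims(8);
  assert(dims[0] == 8 && dims[1] == 2 && dims[2] == 3);
  return 0;
}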
*/ #include "paddle/fluid/operators/bilinear_tensor_product_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -121,15 +124,9 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { "The second dimension of input(Out@GRAD) must be equal to " "the third dimension of the Input(Weight)."); - if (ctx->HasInput("Bias")) { - auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - bias_dims[1], out_dims[1], - "The second dimension of input(Out@GRAD) must be equal to " - "the second dimension of the Input(Bias)."); - auto bias_grad_name = framework::GradVarName("Bias"); - if (ctx->HasOutput(bias_grad_name)) - ctx->SetOutputDim(bias_grad_name, bias_dims); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) { + ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]}); } auto x_grad_name = framework::GradVarName("X"); @@ -148,13 +145,39 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { } }; +class BilinearTensorProductGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("bilinear_tensor_product_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput("Weight", Input("Weight")); + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::BilinearTensorProductGradOpDescMaker); REGISTER_OPERATOR(bilinear_tensor_product_grad, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 85612ba47448a7b0d712e9314e3980019c96e9c3..530a54b7ca186008bc8ec4b083254e65378ae619 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -13,10 +13,47 @@ See the License for the specific language governing permissions and limitations under the License. 
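NOTE: the bilinear_tensor_product hunk above introduces the pattern this PR repeats for many operators: replace DefaultGradOpDescMaker, which forwards every forward input and output into the grad op and so keeps them all alive, with a hand-written SingleGradOpDescMaker that wires only what the backward kernel actually reads. A condensed sketch of the pattern (the op name "foo" is illustrative; the framework calls are the ones used in the hunks above and below):

class FooGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("foo_grad");
    // Wire only the tensors the backward kernel dereferences; every forward
    // input omitted here is memory the runtime no longer has to hold.
    op->SetInput("X", Input("X"));
    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetAttrMap(Attrs());
    return op;
  }
};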
*/
 
 #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+#include <memory>
+#include <string>
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ElementwiseDivOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "Div"; }
+  std::string GetEquation() const override { return "Out = X / Y"; }
+};
+
+class ElementwiseDivGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("elementwise_div_grad");
+    op->SetInput("Y", Input("Y"));
+    op->SetInput("Out", Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
+REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
+                  ops::ElementwiseDivOpMaker, ops::ElementwiseOpInferVarType,
+                  ops::ElementwiseDivGradOpDescMaker);
+
+REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
index 8a07339077aeaa4403ffd1e1e30e0d58a9cc30e7..0f0ad8637301772f073bca305b9196b9c7865daf 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -47,7 +47,7 @@ struct DivGradDX {
 template <typename T>
 struct DivGradDY {
   HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
-    return -dout * x / (y * y);
+    return -dout * out / y;
   }
 };
 
@@ -58,13 +58,15 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
     ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
     auto* y = ctx.Input<Tensor>("Y");
     auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
+
+    auto* x = dout;  // Fake x, not used
+
     ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
         ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
   }
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
index ea0dcd736e5700fb0f341938ac3e3e3b178f29c1..b7df9c6f845dfc941e3c6acbc986a584e984a1de 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
@@ -13,9 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License.
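NOTE: the DivGradDY rewrite above is an algebraic simplification, not a behavior change: since out = x / y, we get -dout * x / (y * y) == -dout * out / y, which is exactly why the backward kernel can drop X ("fake x"). A quick standalone check with illustrative values:

#include <cassert>
#include <cmath>
int main() {
  const double x = 3.0, y = 4.0, dout = 0.5;
  const double out = x / y;                   // forward result
  const double dy_old = -dout * x / (y * y);  // original formula, needs x
  const double dy_new = -dout * out / y;      // rewritten formula, needs out
  assert(std::fabs(dy_old - dy_new) < 1e-12);
  return 0;
}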
*/ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMaxOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Max"; } + std::string GetEquation() const override { return "Out = max(X, Y)"; } +}; + +class ElementwiseMaxGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_max_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)"); + +REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp, + ops::ElementwiseMaxOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMaxGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index 3ee0c32e0d5d5df02d5d157416918fb4fb3aca92..abdb1b9671de80d02b9a6a788088f47929fcc6f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -63,10 +63,10 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = dout; // Fake out, not used int axis = ctx.Attr("axis"); ElemwiseGradCompute, MaxGradDy>( ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index b263b9addd40cfd329d2cc8588c278df2cb008e9..f60c0ed8a0faad384f4eaa631c2758f83bc56414 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -13,9 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. 
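NOTE: the same trick applies to elementwise_max above (and elementwise_min below): the backward formulas dX = dout * (x > y) and dY = dout * (x <= y) read only X, Y and dOut, so the forward Out tensor can safely be aliased to dOut ("fake out"). Standalone illustration:

#include <cassert>
int main() {
  const double x = 2.0, y = 5.0, dout = 1.5;
  const double dx = dout * static_cast<double>(x > y);   // 0.0, y was the max
  const double dy = dout * static_cast<double>(x <= y);  // 1.5, grad goes to y
  assert(dx == 0.0 && dy == 1.5);
  return 0;
}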
*/ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include +#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMinOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Min"; } + std::string GetEquation() const override { return "Out = min(X, Y)"; } +}; + +class ElementwiseMinGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_min_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)"); + +REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp, + ops::ElementwiseMinOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMinGradOpDescMaker); + +REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_min, ops::ElementwiseMinKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index d04e372faaa4e6296e982afe6155cdde2fec4f81..1a49a6013987ae1ec685ec91ca656e4756ba7c32 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -62,10 +62,10 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = dout; // Fake out, not used int axis = ctx.Attr("axis"); ElemwiseGradCompute, MinGradDy>( ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 6dbb9072495f743a4df1ff05e029a227c2cf618b..95246b38f530ff5f81e1fbb5f1dd22149943c8ff 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -173,12 +173,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + auto out_grad_name = framework::GradVarName("Out"); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + PADDLE_ENFORCE(ctx->HasInput(out_grad_name), "Input(Out@GRAD) should not be null"); - auto x_dims = ctx->GetInputDim("X"); + auto x_dims = ctx->GetInputDim(out_grad_name); auto y_dims = ctx->GetInputDim("Y"); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), @@ -187,8 +187,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - 
ctx->ShareDim("X", /*->*/ x_grad_name); - ctx->ShareLoD("X", /*->*/ x_grad_name); + ctx->ShareDim(out_grad_name, /*->*/ x_grad_name); + ctx->ShareLoD(out_grad_name, /*->*/ x_grad_name); } if (ctx->HasOutput(y_grad_name)) { ctx->ShareDim("Y", /*->*/ y_grad_name); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 453a1b32a0171a2ca88879ab3287e89c4d3c7759..b8921b171cf37be17fb62d270a5c22f9d1806c64 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -46,6 +46,7 @@ obtained from the `input` tensor. )DOC"); } }; + } // namespace operators } // namespace paddle @@ -53,7 +54,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOp, paddle::framework::EmptyGradOpMaker, - ops::FillConstantBatchSizeLikeOpMaker); + ops::FillConstantBatchSizeLikeOpMaker, + ops::BatchSizeLikeNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpKernel(ctx.Attr("dtype")), + ctx.GetPlace()); + } }; + +class FillZerosLikeOp2Maker : public FillZerosLikeOpMaker { + protected: + void ExtraMake() override { + this->AddAttr("dtype", + "(int, default 5(FP32)) " + "Output data type.") + .SetDefault(framework::proto::VarType::FP32); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(FillZerosLikeOp2NoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); + +REGISTER_OPERATOR(fill_zeros_like2, ops::FillZerosLikeOp2, + ops::FillZerosLikeOp2Maker, + ops::FillZerosLikeOp2NoNeedBufferVarsInference, + paddle::framework::EmptyGradOpMaker); + REGISTER_OP_CPU_KERNEL( fill_zeros_like, ops::FillZerosLikeKernel, @@ -58,3 +95,11 @@ REGISTER_OP_CPU_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); + +REGISTER_OP_CPU_KERNEL( + fill_zeros_like2, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index e80a703c30c0335124c089ea82ba4f6fe055acde..1831635def79b3ccb713dbc14cc70b8beeb609fc 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -26,3 +26,13 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); + +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like2, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 98ebe1fdf4bb3308b2f07a073072031e79e14146..01302687a421165e908b2aa0646ba8b9c835034e 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -65,17 +65,13 @@ by input arguments. 
} }; -DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( - GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input"); - } // namespace operators } // namespace paddle -REGISTER_OPERATOR( - gaussian_random_batch_size_like, - paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference); +REGISTER_OPERATOR(gaussian_random_batch_size_like, + paddle::operators::GaussianRandomBatchSizeLikeOp, + paddle::operators::GaussianRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::BatchSizeLikeNoNeedBufferVarsInference); // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2ab40f482d7a1463703085037bcb94fd4aecf377..09fd6a25d18d5484f4d1c1631faae8da2fbd5473 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include namespace paddle { namespace operators { @@ -107,8 +108,6 @@ class GroupNormGradOp : public framework::OperatorWithKernel { // check input PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of GroupNormOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Mean"), - "Input(Mean) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Variance"), "Input(Variance) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), @@ -159,7 +158,6 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("Bias", Input("Bias")); op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput("Y", Output("Y")); - op->SetInput("Mean", Output("Mean")); op->SetInput("Variance", Output("Variance")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index f458ce6c83bfcfb56d558409b0802f27f13a4761..b6cfa9cc43c312e60a1b7c5e13d1ecbe6bc5dc7d 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
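NOTE: the gaussian_random hunk above deletes the op's private no-need-buffer class in favor of the shared BatchSizeLikeNoNeedBufferVarsInference declared in batch_size_like.h earlier in this diff. Judging from the hand-written SequenceConvGradNoNeedBufferVarsInference later in this diff, the DECLARE_ macro presumably expands to roughly the following (a sketch under that assumption, not the actual macro output):

class BatchSizeLikeNoNeedBufferVarsInference
    : public framework::NoNeedBufferVarsInference {
 public:
  using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference;

  std::unordered_set<std::string> operator()() const override {
    // *_batch_size_like ops read only dims(Input)[0]; the data buffer of
    // "Input" can therefore be released ahead of time.
    return {"Input"};
  }
};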
*/ #include "paddle/fluid/operators/hinge_loss_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -97,12 +100,29 @@ class HingeLossGradOp : public framework::OperatorWithKernel { } }; +class HingeLossGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("hinge_loss_grad"); + op->SetInput("Logits", Input("Logits")); + op->SetInput("Labels", Input("Labels")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::HingeLossGradOpDescMaker); REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp); REGISTER_OP_CPU_KERNEL( hinge_loss, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 253b65a5f33308fc2c94537641b0fa19378b0cc9..a72db384c1f09f66ecf7ce85271d6263bbdcb523 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/huber_loss_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -90,38 +93,45 @@ class HuberLossGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Residual"), - "Input(Residual) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto residual_dims = ctx->GetInputDim("Residual"); - auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ(residual_dims, x_dims); - PADDLE_ENFORCE_EQ(out_grad_dims, x_dims); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); + ctx->SetOutputDim(x_grad_name, residual_dims); } if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); + ctx->SetOutputDim(y_grad_name, residual_dims); } } }; +class HuberLossGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("huber_loss_grad"); + op->SetInput("Residual", Output("Residual")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, - 
paddle::framework::DefaultGradOpDescMaker); + ops::HuberLossGradOpDescMaker); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); REGISTER_OP_CPU_KERNEL( huber_loss, ops::HuberLossKernel, diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index aa19c62c83648814e86b1e7062424be3693e4b98..81fbe3e514241ecdd2832141eba4250ced2017a9 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/norm_op.h" +#include +#include +#include + namespace paddle { namespace operators { @@ -74,6 +78,24 @@ class NormOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; + +class NormOpGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("norm_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Norm", Output("Norm")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -81,7 +103,7 @@ namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::NormOpGradOpDescMaker); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel, ops::NormKernel); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 6ef2dacb3869ab3b20505699c2fbe2f129c20068..9731aefa95c5243e29ace87ad8c35d5b01904e60 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -612,8 +615,9 @@ class Pad2dOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); } }; @@ -625,7 +629,9 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); - bind->SetInput("Paddings", Input("Paddings")); + if (ForwardOp().Inputs().count("Paddings") > 0) { + bind->SetInput("Paddings", Input("Paddings")); + } bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); @@ -634,6 +640,10 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { } }; +// TODO(zjl): Paddings can also be skipped! 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(Pad2dOpGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle @@ -641,6 +651,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker, ops::Pad2dOpGradMaker); -REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad); +REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad, + ops::Pad2dOpGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel); REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel); diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index d283bddbe9f974ac6835ee91d5a7851453687b80..81aabdd0061b3940f23d4731d55fc5cbe5817004 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/row_conv_op.h" +#include +#include +#include + #include "paddle/fluid/framework/eigen.h" namespace paddle { @@ -54,7 +58,6 @@ class RowConvGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); PADDLE_ENFORCE(ctx->HasInput("Filter"), "Input(Filter) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), @@ -62,8 +65,8 @@ class RowConvGradOp : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim(x_grad_name, x_dims); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(x_grad_name, dout_dims); } auto filter_grad_name = framework::GradVarName("Filter"); @@ -259,12 +262,31 @@ class RowConvGradKernel } } }; + +class RowConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("row_conv_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::RowConvGradOpDescMaker); REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp); REGISTER_OP_CPU_KERNEL( row_conv, ops::RowConvKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index 37f1b9dda50ba4b62d7cf75765125e0ad663d9d8..d652f9216f8faf53deeac2c9ce1f737651c3939b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
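NOTE: a second recurring pattern, first visible in the pad2d and row_conv hunks above: once "X" is no longer an input whose buffer the grad op may read, the grad kernel's data type has to be derived from Grad("Out") instead of "X". Condensed form of the override used throughout (Tensor is assumed to alias framework::Tensor, as in these files):

framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  // Take the dtype from dOut; X's buffer may already have been released.
  return framework::OpKernelType(
      ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
      ctx.GetPlace());
}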
#include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" +#include #include namespace paddle { @@ -73,13 +74,43 @@ class SeqConcatShapeInferer : public framework::InferShapeBase { } }; -class SeqConcatGradShapeInferer : public framework::InferShapeBase { +class SeqConcatGradOpDescMaker : public framework::SingleGradOpDescMaker { public: - void operator()(framework::InferShapeContext *context) const override { + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_concat_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class SeqConcatGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { context->SetOutputsDim(framework::GradVarName("X"), context->GetInputsDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SeqConcatGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle @@ -87,14 +118,14 @@ namespace op = paddle::operators; REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, op::SeqConcatOpMaker, op::SeqConcatShapeInferer, - paddle::framework::DefaultGradOpDescMaker); + op::SeqConcatGradOpDescMaker); template using Kernel = op::SeqConcatKernel; REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, Kernel); -REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, - op::SeqConcatGradShapeInferer); +REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp, + op::SeqConcatGradNoNeedBufferVarsInference); template using GradKernel = op::SeqConcatGradKernel; diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index ff035f421c4907ba940b973b3fd2a9421ed2dbae..f9b2ed3846a0f29bd2b058b944360a8fb66c24f8 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include "boost/optional.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/concat_and_split.h" @@ -89,37 +91,49 @@ class SeqConcatGradKernel : public framework::OpKernel { dxs[i]->mutable_data(context.GetPlace()); } } + std::vector sliced_x; - std::vector> sliced_dx; + std::vector> sliced_dx; for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) { for (size_t j = 0; j < xs.size(); ++j) { const framework::LoDTensor *x = xs[j]; + framework::DDim x_dims = x->dims(); + framework::LoDTensor *dx = dxs[j]; auto &x_lod = x->lod()[0]; - sliced_x.emplace_back(x->Slice(x_lod[i - 1], x_lod[i])); - if (dx != nullptr) { - sliced_dx.emplace_back(dx->Slice(x_lod[i - 1], x_lod[i])); + + auto prev_lod = x_lod[i - 1]; + auto next_lod = x_lod[i]; + + x_dims[0] = next_lod - prev_lod; + + sliced_x.emplace_back(); + sliced_x.back().Resize(x_dims); + + if (dx) { + sliced_dx.emplace_back(dx->Slice(prev_lod, 
next_lod)); } else { - sliced_dx.emplace_back(boost::blank()); + sliced_dx.emplace_back(boost::none); } } } - math::SplitFunctor functor; std::vector sliced_x_ptr; - std::vector sliced_dx_ptr; + sliced_x_ptr.reserve(sliced_x.size()); for (auto &x : sliced_x) { sliced_x_ptr.emplace_back(&x); } + std::vector sliced_dx_ptr; + sliced_dx_ptr.reserve(sliced_dx.size()); for (auto &dx : sliced_dx) { - try { - sliced_dx_ptr.emplace_back(&boost::get(dx)); - } catch (boost::bad_get &) { - sliced_dx_ptr.emplace_back(nullptr); + if (dx) { + sliced_dx_ptr.emplace_back(&dx.get()); } } + + math::SplitFunctor functor; functor(context.template device_context(), detail::Ref( context.Input(framework::GradVarName("Out")), diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index 65cd9edbc7125f605d6fb437a2e056054eb9a6d7..89c1fe834832802cc86dacd5a2d8c22bafa6072b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h" #include +#include +#include +#include namespace paddle { namespace operators { @@ -171,13 +174,57 @@ context_length, context_stride and context_start. } }; +class SequenceConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_conv_grad"); + op->SetAttrMap(Attrs()); + + if (boost::get(Attrs().at("paddingTrainable")) && + ForwardOp().Inputs().count("PaddingData") > 0) { + op->SetInput("PaddingData", Input("PaddingData")); + op->SetOutput(framework::GradVarName("PaddingData"), + InputGrad("PaddingData")); + } + + op->SetInput("X", Input("X")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + + return op; + } +}; + +class SequenceConvGradNoNeedBufferVarsInference + : public framework::NoNeedBufferVarsInference { + public: + using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference; + + std::unordered_set operator()() const override { + if (!boost::get(Attrs().at("paddingTrainable"))) { + return {"PaddingData"}; + } else { + return {}; + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp); + ops::SequenceConvGradOpDescMaker); + +REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp, + ops::SequenceConvGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_conv, diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc index 3b79d0c71975bb740b4085ce80f7d95b65f600c1..e1f6c3e3d599340acfa9bb5b47017b003721e4a3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
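NOTE: the sequence_concat grad kernel above also swaps the boost::blank / boost::bad_get juggling for boost::optional, and sliced_x entries are now only Resize()d (shape metadata, no data buffer), consistent with "X" being a no-need-buffer input. Minimal standalone illustration of the optional access pattern (assumes Boost is available, as in the file above):

#include <boost/optional.hpp>
#include <vector>
int main() {
  std::vector<boost::optional<int>> slots{1, boost::none, 3};
  int sum = 0;
  for (auto &s : slots) {
    if (s) sum += s.get();  // dereference only engaged optionals
  }
  return sum == 4 ? 0 : 1;  // exits 0 on the expected result
}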
*/ #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" +#include +#include namespace paddle { namespace operators { @@ -70,6 +72,12 @@ class SequenceExpandAsOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("Y", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } }; class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker { @@ -131,7 +139,6 @@ class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); @@ -143,16 +150,48 @@ class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", x_grad_name); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; +class SequenceExpandAsOpGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_expand_as_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandAsOpNoNeedBufferVarsInference, "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandAsGradOpNoNeedBufferVarsInference, "X", "Y"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp, ops::SequenceExpandAsOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad); + ops::SequenceExpandAsOpGradOpDescMaker, + ops::SequenceExpandAsOpNoNeedBufferVarsInference); +REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad, + ops::SequenceExpandAsGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_expand_as, ops::SequenceExpandAsKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index f6c42415301bc8d6f3509bfba2ff356265643bad..b7c0420636ab60e8a3e0a9332cbd3858aacda1b0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
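NOTE: in sequence_expand_as above, "Y" is declared no-need-buffer for both forward and backward because only Y's LoD (the sequence offsets) drives the expansion; Y's data is never read. The semantics reduced to plain vectors (illustrative only, not framework code):

#include <cassert>
#include <cstddef>
#include <vector>
int main() {
  const std::vector<int> x{10, 20};
  const std::vector<std::size_t> y_lod{0, 3, 5};  // row 0: 3 copies, row 1: 2
  std::vector<int> out;
  for (std::size_t i = 0; i + 1 < y_lod.size(); ++i)
    for (std::size_t j = y_lod[i]; j < y_lod[i + 1]; ++j) out.push_back(x[i]);
  assert((out == std::vector<int>{10, 10, 10, 20, 20}));
  return 0;
}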
*/ #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" +#include namespace paddle { namespace operators { @@ -96,6 +97,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } }; class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { @@ -188,7 +195,6 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); @@ -199,16 +205,47 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; +class SequenceExpandOpGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_expand_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SequenceExpandOpNoNeedBufferVarsInference, + "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandGradOpNoNeedBufferVarsInference, "X", "Y"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad); + ops::SequenceExpandOpGradDescMaker, + ops::SequenceExpandOpNoNeedBufferVarsInference); +REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad, + ops::SequenceExpandGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_expand, ops::SequenceExpandKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 23c7bf7cea830bb0ccf5e81f99130043c2d5f80b..5290d0e6c6a2569e389345f61a0844ce3cbde10f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" +#include +#include namespace paddle { namespace operators { @@ -194,18 +196,39 @@ class SequencePadGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; +class SequencePadGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_pad_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequencePadGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp); + ops::SequencePadGradOpDescMaker); +REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp, + ops::SequencePadGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_pad, ops::SequencePadOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 1754221e7711b09c38f81c3f5803daa5372ed0dd..b4923571df95432d030d393a69d427f3ae17f298 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
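NOTE: the sequence_pad hunk above shows the subtlety of no-need-buffer inputs: the grad maker still wires "X" into the grad op even though "X" is declared no-need-buffer. The variable stays connected so InferShape can read its dims and LoD; only the data buffer becomes eligible for early release. In excerpt form:

// Grad-desc maker: keep the edge so X's dims()/lod() stay readable.
op->SetInput("X", Input("X"));
// Registration: additionally mark X's *buffer* as skippable, as above:
// REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp,
//                   ops::SequencePadGradOpNoNeedBufferVarsInference);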
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" +#include #include namespace paddle { @@ -114,8 +115,9 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -138,13 +140,17 @@ class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequencePoolGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, ops::SequencePoolGradOpMaker); -REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp, + ops::SequencePoolGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_pool, ops::SequencePoolKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 8267c04f9f20511deba363f9a0aae761736ba90b..5a22212edf29cc79d28b12029dc7595ae5f1aab3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h" +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" @@ -124,25 +125,49 @@ class SequenceScatterGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - platform::CPUPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + platform::CPUPlace()); } }; +class SequenceScatterGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_scatter_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput("Updates", Input("Updates")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceScatterGradNoNeedBufferVarsInference, "Updates"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_scatter, ops::SequenceScatterOp, ops::SequenceScatterOpMaker, - paddle::framework::DefaultGradOpDescMaker); 
-REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp); + ops::SequenceScatterGradDescMaker); +REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp, + ops::SequenceScatterGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(sequence_scatter, ops::SequenceScatterOpKernel, ops::SequenceScatterOpKernel, ops::SequenceScatterOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index 35f49f78cedaca59d58ea19b909e5a950281c6e9..4b2ec6e7cad7c04e248c0ffbb117951fba1ec877 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" +#include namespace paddle { namespace operators { @@ -70,8 +71,9 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -113,14 +115,35 @@ NOTE: The first dimension size of input, the size of offset and Length, should b } }; +class SequenceSliceGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_slice_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Offset", Input("Offset")); + op->SetInput("Length", Input("Length")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceSliceGradNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp, - ops::SequenceSliceOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp); + ops::SequenceSliceOpMaker, ops::SequenceSliceGradOpDescMaker); +REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp, + ops::SequenceSliceGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_slice, ops::SequenceSliceOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 2cf508e0b707ecc986886e72e5d42fde3c84894d..6c98a3e8731abb989f8dab97eff5c6ad56111742 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" +#include +#include namespace paddle { namespace operators { @@ -125,19 +127,39 @@ class SequenceUnpadGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; +class SequenceUnpadGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_unpad_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceUnpadGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, - ops::SequenceUnpadOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); + ops::SequenceUnpadOpMaker, ops::SequenceUnpadGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp, + ops::SequenceUnpadGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_unpad, ops::SequenceUnpadOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 07df3dca831d7e646050ae57402c1a493c2e50e9..fe8ca41b698159a782547ce673a374d074d3b73d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -81,10 +81,9 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel { auto* d_x = ctx.Output(framework::GradVarName("X")); if (d_x) { const auto* d_out = ctx.Input(framework::GradVarName("Out")); - const auto* x_t = ctx.Input("X"); d_x->mutable_data(ctx.GetPlace()); - int padded_length = x_t->dims()[1]; + int padded_length = d_x->dims()[1]; LoDTensor zero_pads; zero_pads.Resize({1, 1}); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 26355e58615454c8e9aea1d6a5405368e6006e87..ad6fb3510f02ae783c8ae4318f559a8db74a59d1 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -11,6 +11,7 @@ limitations under the License. 
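NOTE: in the sequence_unpad kernel above, padded_length now comes from d_x rather than x: dX is allocated with X's dims, so the two reads are identical and X's buffer never has to survive into the backward pass. In excerpt form:

// dX shares X's shape, so the padded length is recoverable without X:
int padded_length = d_x->dims()[1];  // == x->dims()[1]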
*/ #include "paddle/fluid/operators/shuffle_channel_op.h" #include +#include namespace paddle { namespace operators { @@ -73,12 +74,7 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@Grad) should not be null"); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Output(X@Grad) should not be null"); - - auto input_dims = ctx->GetInputDim("X"); + auto input_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); ctx->SetOutputDim(framework::GradVarName("X"), input_dims); @@ -87,8 +83,9 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -100,7 +97,6 @@ class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { std::unique_ptr op(new framework::OpDesc()); op->SetType("shuffle_channel_grad"); - op->SetInput("X", Input("X")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 9506343b3d508459c6e10dc68eba13504b07338f..dbc3e1a7ebe26ffccd24d1749093d014751d866f 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -78,10 +78,14 @@ template class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = input_grad->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; @@ -91,10 +95,7 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { int group_row = group; int group_column = channel / group_row; - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index f6af1bc88598870ebccef81bd37f93f376940851..3ce1e0c770bb3fe6c4b0a54dad14e47f372958af 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -57,10 +57,14 @@ template class ShuffleChannelGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = 
input_grad->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; @@ -71,10 +75,6 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { int group_row = group; int group_column = channel / group_row; - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); for (int n = 0; n < num; ++n) { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index c21b0c13c752b82b80c120cb5a5d4a010ef18287..5c92588cc1d073612d2f6a7b315edf16cc14bedd 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -139,6 +142,24 @@ However the output only shares the LoD with input `X`. } }; +class SigmoidCrossEntropyWithLogitsGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sigmoid_cross_entropy_with_logits_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -146,7 +167,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SigmoidCrossEntropyWithLogitsGradOpDescMaker); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 94995fc99612adb1164e60f1a51747f74eacfb73..589c98e51e32bc9eb7d6ccfb721a6a5f091470cf 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/slice_op.h" #include +#include #include namespace paddle { @@ -135,6 +136,13 @@ class SliceOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; class SliceOpGradMaker : public framework::SingleGradOpDescMaker { @@ -153,13 +161,17 @@ class SliceOpGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SliceOpGradNoNeedBufferVarsInference, + "Input"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker, ops::SliceOpGradMaker); -REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad); +REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad, + ops::SliceOpGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( slice, ops::SliceKernel, diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 7df649fc5b7bf8671303a28d727be1d85c1fa6e4..3b7d90b795b45d97dfdbe90f7e37ea28b942f2a0 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -10,6 +10,9 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -125,19 +128,32 @@ class TemporalShiftOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +class TemporalShiftGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("temporal_shift_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -146,8 +162,7 @@ class TemporalShiftOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(temporal_shift, ops::TemporalShiftOp, - ops::TemporalShiftOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::TemporalShiftOpMaker, ops::TemporalShiftGradOpDescMaker); REGISTER_OPERATOR(temporal_shift_grad, ops::TemporalShiftOpGrad); REGISTER_OP_CPU_KERNEL(temporal_shift, ops::TemporalShiftKernel, ops::TemporalShiftKernel); diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc index 
75d6181749e4e9bd81a3c02de69caf0acd81eef9..7260fe25d6ebb357040af8774c574b767bfd9f13 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc @@ -64,8 +64,9 @@ with random values sampled from a uniform distribution. } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( - uniform_random_batch_size_like, - paddle::operators::UniformRandomBatchSizeLikeOp, - paddle::operators::UniformRandomBatchSizeLikeOpMaker); +REGISTER_OPERATOR(uniform_random_batch_size_like, + paddle::operators::UniformRandomBatchSizeLikeOp, + paddle::operators::UniformRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::BatchSizeLikeNoNeedBufferVarsInference); // Kernels are registered in uniform_random_op.cc and uniform_random_op.cu diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5b79b759d555934bbc40a03da316d138f4e81a99..044677fb756e0368c65b84f15fdf2540abbd14b8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1299,7 +1299,20 @@ All parameter, weight, gradient are variables in Paddle. to fuse relu and depthwise_conv2d, it will save GPU memory and may make the execution faster. This options is only available in GPU devices. - Default False)DOC") + Default False.)DOC") + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized."); + self.fuse_broadcast_ops_ = b; + }, + R"DOC(The type is BOOL, fuse_broadcast_ops indicates whether + to fuse the broadcast ops. Note that, in Reduce mode, + fusing broadcast ops may make the program faster, because + fusing broadcast ops is equivalent to delaying the execution of all + broadcast ops; in this case, all nccl streams are used only + for NCCLReduce operations for a period of time. Default False.)DOC") + .def_property("fuse_all_optimizer_ops", [](const BuildStrategy &self) { return self.fuse_all_optimizer_ops_; diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 6303be003a701e57a8aa1e2f925459f416cdb543..9fd53a74bf51929f9e115fdc94f2f85f8e2fbdda 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -231,9 +231,16 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): for idx, op_desc in enumerate(op_descs): for arg in op_desc.input_arg_names(): if core.grad_var_suffix() in arg and arg in no_grad_set: - to_insert.append((_create_op_desc_("fill_zeros_like", { - "X": [_strip_grad_suffix_(arg)] - }, {"Out": [arg]}, {}), idx)) + x_in = _strip_grad_suffix_(arg) + x_in_var_desc = op_desc.block().find_var_recursive( + cpt.to_bytes(x_in)) + assert x_in_var_desc is not None, "Variable {} not found".format( + x_in) + dtype = x_in_var_desc.dtype() + + to_insert.append( + (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, + {"Out": [arg]}, {"dtype": dtype}), idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 7665703a726050b2a236b5d96bbafb6c24b2617a..bf3a16addcb532e7b6a781e3845e0f5791ac5d71 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -15,19 +15,20 @@ from __future__ import print_function from six.moves import reduce -import numpy as np from .. import core from ..layers import utils from . 
import layers -from ..framework import Variable, OpProtoHolder, Parameter -from ..layers import layer_function_generator +from ..framework import Variable, _in_dygraph_mode, OpProtoHolder, Parameter from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer +import numpy as np __all__ = [ - 'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm', - 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', 'SequenceConv' + 'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', + 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', + 'Conv3DTranspose', 'SequenceConv', 'RowConv', 'GroupNorm', 'SpectralNorm', + 'TreeConv' ] @@ -240,6 +241,303 @@ class Conv2D(layers.Layer): return self._helper.append_activation(pre_act, act=self._act) +class Conv3D(layers.Layer): + """ + **Convolution3D Layer** + + The convolution3D layer calculates the output based on the input, filter, + strides, paddings, dilations, and groups parameters. Input(Input) and + Output(Output) are in NCDHW format, where N is batch size, C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. Convolution3D is similar to Convolution2D + but adds one dimension (depth). If bias attribute and activation type are + provided, bias is added to the output of the convolution, and the + corresponding activation function is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` + + - Output: + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + Args: + input (Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of filters. It is the same as the output + image channels. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a cube. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, + stride_D = stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, + padding_D = padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. 
+ groups (int): The number of groups of the Conv3D layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d. If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool): Use cudnn kernel or not; it is valid only when the cudnn + library is installed. Default: True. + act (str): Activation type; if it is set to None, activation is not appended. + Default: None. + name (str|None): A name for this layer (optional). If set to None, the layer + will be named automatically. Default: None. + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu") + """ + + def __init__(self, + name_scope, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None): + assert param_attr is not False, "param_attr should not be False here." 
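For reference, each output-shape formula in the docstring above reduces to one integer expression per spatial axis. A minimal sketch of that arithmetic (the helper name is an assumption, not part of the patch):

def conv3d_out_dim(in_dim, padding, dilation, filter_dim, stride):
    # One spatial axis of the docstring formula:
    # out = (in + 2*pad - (dilation*(k - 1) + 1)) / stride + 1
    return (in_dim + 2 * padding -
            (dilation * (filter_dim - 1) + 1)) // stride + 1

# A 6x6x6 input with filter_size=2 and the defaults (stride=1, padding=0,
# dilation=1) -- the test_conv3d case later in this diff -- yields 5x5x5:
assert conv3d_out_dim(6, padding=0, dilation=1, filter_dim=2, stride=1) == 5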
+ super(Conv3D, self).__init__(name_scope) + self._groups = groups + self._stride = utils.convert_to_list(stride, 3, 'stride') + self._padding = utils.convert_to_list(padding, 3, 'padding') + self._dilation = utils.convert_to_list(dilation, 3, 'dilation') + self._act = act + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + self._use_cudnn = use_cudnn + self._filter_size = filter_size + self._num_filters = num_filters + self._param_attr = param_attr + self._bias_attr = bias_attr + + def _build_once(self, input): + num_channels = input.shape[1] + self._dtype = self._helper.input_dtype(input) + + if self._groups is None: + num_filter_channels = num_channels + else: + if num_channels % self._groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels // self._groups + + filter_size = utils.convert_to_list(self._filter_size, 3, 'filter_size') + + filter_shape = [self._num_filters, num_filter_channels] + filter_size + + def _get_default_param_initializer(): + filter_elem_num = filter_size[0] * filter_size[1] * filter_size[ + 2] * num_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std, 0) + + self._filter_param = self.create_parameter( + attr=self._param_attr, + shape=filter_shape, + dtype=self._dtype, + default_initializer=_get_default_param_initializer()) + + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='conv3d', + inputs={ + 'Input': input, + 'Filter': self._filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False + }) + + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + + return self._helper.append_activation(pre_act, act=self._act) + + +class Conv3DTranspose(layers.Layer): + def __init__(self, + name_scope, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None): + super(Conv3DTranspose, self).__init__(name_scope) + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + assert param_attr is not False, "param_attr should not be False in conv3d_transpose." 
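The default initializer chosen in `_build_once` above is a He/MSRA-style rule: std = sqrt(2 / fan_in), with fan_in taken as the filter volume times the number of input channels. A quick check of that arithmetic, with illustrative values only:

# Illustrative values: a 3x3x3 filter over 4 input channels.
filter_elem_num = 3 * 3 * 3 * 4        # fan-in, as in _get_default_param_initializer
std = (2.0 / filter_elem_num) ** 0.5   # ~0.136
# The filter parameter is then drawn from Normal(0.0, std).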
+ self._padding = utils.convert_to_list(padding, 3, 'padding') + self._stride = utils.convert_to_list(stride, 3, 'stride') + self._dilation = utils.convert_to_list(dilation, 3, 'dilation') + self._param_attr = param_attr + self._filter_size = filter_size + self._output_size = output_size + self._groups = 1 if groups is None else groups + self._num_filters = num_filters + self._use_cudnn = use_cudnn + self._bias_attr = bias_attr + self._act = act + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + self._input_channel = input.shape[1] + + if self._filter_size is None: + if self._output_size is None: + raise ValueError( + "output_size must be set when filter_size is None") + if isinstance(self._output_size, int): + # an int output_size must expand to all three spatial axes + self._output_size = [self._output_size] * 3 + + d_in = input.shape[2] + h_in = input.shape[3] + w_in = input.shape[4] + + filter_size_d = (self._output_size[0] - + (d_in - 1) * self._stride[0] + 2 * self._padding[0] + - 1) // self._dilation[0] + 1 + filter_size_h = (self._output_size[1] - + (h_in - 1) * self._stride[1] + 2 * self._padding[1] + - 1) // self._dilation[1] + 1 + filter_size_w = (self._output_size[2] - + (w_in - 1) * self._stride[2] + 2 * self._padding[2] + - 1) // self._dilation[2] + 1 + self._filter_size = [filter_size_d, filter_size_h, filter_size_w] + else: + self._filter_size = utils.convert_to_list( + self._filter_size, 3, 'conv3d_transpose.filter_size') + + filter_shape = [ + self._input_channel, self._num_filters // self._groups + ] + self._filter_size + self._img_filter = self.create_parameter( + dtype=self._dtype, shape=filter_shape, attr=self._param_attr) + if self._bias_attr: + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type="conv3d_transpose", + inputs={'Input': [input], + 'Filter': [self._img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn + }) + + if self._bias_attr: + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + else: + pre_act = pre_bias + + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(pre_act, act=self._act) + + class Pool2D(layers.Layer): """ ${comment} @@ -1720,6 +2018,8 @@ class SequenceConv(layers.Layer): bias_attr=None, param_attr=None, act=None): + assert not _in_dygraph_mode( + ), "SequenceConv is not supported by dynamic graph mode yet!" 
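When only `output_size` is given, `_build_once` above derives the filter size by inverting the transposed-convolution output relation. A minimal sketch of that inversion (the helper name is an assumption):

def infer_transpose_filter_dim(out_dim, in_dim, stride, padding, dilation):
    # Rearranged from out = (in - 1)*stride - 2*pad + dilation*(k - 1) + 1,
    # solved for k with floor division, as in the layer code above.
    return (out_dim - (in_dim - 1) * stride + 2 * padding - 1) // dilation + 1

# A 2x2x2 input with stride=1, padding=0, dilation=1 and a desired 13x13x13
# output needs filter_size 12 -- consistent with test_conv3d_transpose near
# the end of this diff (filter_size=12 on a 2x2x2 input):
assert infer_transpose_filter_dim(13, 2, 1, 0, 1) == 12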
super(SequenceConv, self).__init__(name_scope) self._num_filters = num_filters self._filter_size = filter_size @@ -1729,12 +2029,10 @@ class SequenceConv(layers.Layer): self._param_attr = param_attr def _build_once(self, input): - self._dtype = self._helper.input_dtype(input) - print(self._filter_size) filter_shape = [self._filter_size * input.shape[1], self._num_filters] self._filter_param = self.create_parameter( - attr=self.param_attr, shape=filter_shape, dtype=self._dtype) + attr=self._param_attr, shape=filter_shape, dtype=self._dtype) def forward(self, input): pre_bias = self._helper.create_variable_for_type_inference(self._dtype) @@ -1752,3 +2050,237 @@ class SequenceConv(layers.Layer): }) pre_act = self._helper.append_bias_op(pre_bias) return self._helper.append_activation(pre_act) + + +class RowConv(layers.Layer): + def __init__(self, + name_scope, + future_context_size, + param_attr=None, + act=None): + assert not _in_dygraph_mode( + ), "RowConv is not supported by dynamic graph mode yet!" + super(RowConv, self).__init__(name_scope) + self._act = act + self._param_attr = param_attr + self._future_context_size = future_context_size + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + filter_shape = [self._future_context_size + 1, input.shape[1]] + self._filter_param = self.create_parameter( + attr=self._param_attr, + shape=filter_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, input): + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type='row_conv', + inputs={'X': [input], + 'Filter': [self._filter_param]}, + outputs={'Out': [out]}) + return self._helper.append_activation(out, act=self._act) + + +class GroupNorm(layers.Layer): + """ + **Group Normalization Layer** + + Refer to `Group Normalization `_ . + + Args: + name_scope (str): See base class. + groups(int): The number of groups the channels are divided into. + epsilon(float): The small value added to the variance to prevent + division by zero. + param_attr(ParamAttr|None): The parameter attribute for the learnable + scale :math:`g`. If it is set to False, no scale will be added to the output units. + If it is set to None, the scale is initialized one. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + act(str): Activation to be applied to the output of group normalization. + data_layout(string|NCHW): Only NCHW is supported. + dtype(np.dtype|core.VarDesc.VarType|str): The data type: float32, float16, int, etc. + + Returns: + Variable: A tensor variable which is the result after applying group normalization on the input. 
+ + + """ + + def __init__(self, + name_scope, + groups, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + data_layout='NCHW'): + super(GroupNorm, self).__init__(name_scope) + self._param_attr = param_attr + self._bias_attr = bias_attr + self._epsilon = epsilon + self._groups = groups + self._act = act + if data_layout != 'NCHW': + raise ValueError("unsupported data layout:" + data_layout) + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + param_shape = [input.shape[1]] + if self._bias_attr: + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + + if self._param_attr: + self._scale = self.create_parameter( + attr=self._param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + + def forward(self, input): + inputs = {'X': input} + # Scale and Bias are created in _build_once only when the corresponding + # attr is set, so guard on the attrs rather than on the parameters. + if self._bias_attr: + inputs['Bias'] = self._bias + if self._param_attr: + inputs['Scale'] = self._scale + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + group_norm_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, + "groups": self._groups}) + + return self._helper.append_activation(group_norm_out, self._act) + + +class SpectralNorm(layers.Layer): + def __init__(self, name_scope, dim=0, power_iters=1, eps=1e-12, name=None): + super(SpectralNorm, self).__init__(name_scope) + self._power_iters = power_iters + self._eps = eps + self._dim = dim + + def _build_once(self, weight): + self._dtype = self._helper.input_dtype(weight) + input_shape = weight.shape + h = input_shape[self._dim] + w = np.prod(input_shape) // h + + self.u = self.create_parameter( + attr=ParamAttr(), + shape=[h], + dtype=self._dtype, + default_initializer=Normal(0., 1.)) + self.u.stop_gradient = True + + self.v = self.create_parameter( + attr=ParamAttr(), + shape=[w], + dtype=self._dtype, + default_initializer=Normal(0., 1.)) + self.v.stop_gradient = True + + def forward(self, weight): + inputs = {'Weight': weight, 'U': self.u, 'V': self.v} + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="spectral_norm", + inputs=inputs, + outputs={"Out": out, }, + attrs={ + "dim": self._dim, + "power_iters": self._power_iters, + "eps": self._eps, + }) + + return out + + +class TreeConv(layers.Layer): + def __init__(self, + name_scope, + output_size, + num_filters=1, + max_depth=2, + act='tanh', + param_attr=None, + bias_attr=None, + name=None): + super(TreeConv, self).__init__(name_scope) + self._name = name + self._output_size = output_size + self._act = act + self._max_depth = max_depth + self._num_filters = num_filters + self._bias_attr = bias_attr + self._param_attr = param_attr + + def _build_once(self, nodes_vector, edge_set): + assert isinstance(nodes_vector, Variable) + assert isinstance(edge_set, Variable) + self._dtype = self._helper.input_dtype(nodes_vector) + + feature_size = nodes_vector.shape[2] + w_shape = [feature_size, 3, self._output_size, self._num_filters] + if self._bias_attr: + self._bias_param = self.create_parameter( + attr=self._bias_attr, + shape=[self._num_filters], + dtype=self._dtype, + is_bias=True) + 
self.W = self.create_parameter( + attr=self._param_attr, + shape=w_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, nodes_vector, edge_set): + if self._name: + out = self.create_variable( + name=self._name, dtype=self._dtype, persistable=False) + else: + out = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='tree_conv', + inputs={ + 'NodesVector': nodes_vector, + 'EdgeSet': edge_set, + 'Filter': self.W + }, + outputs={'Out': out, }, + attrs={'max_depth': self._max_depth}) + if self._bias_attr: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [out], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_activation]}, + attrs={'axis': 1}) + else: + pre_activation = out + return self._helper.append_activation(pre_activation, act=self._act) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index fd07ff0ba3d21721fbbc46099f7dcb6937f93524..c7c82f28e7c441b4aa24ffa81a8695e565d737d8 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -227,7 +227,7 @@ class Precision(MetricBase): metric.reset() for data in train_reader(): loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) + metric.update(preds=preds, labels=labels) numpy_precision = metric.eval() """ @@ -241,9 +241,11 @@ class Precision(MetricBase): raise ValueError("The 'preds' must be a numpy ndarray.") if not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray.") - sample_num = labels[0] + sample_num = labels.shape[0] + preds = np.rint(preds).astype("int32") + for i in range(sample_num): - pred = preds[i].astype("int32") + pred = preds[i] label = labels[i] if label == 1: if pred == label: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1390e759d7e309674a2ecb61c59043b0f5032400..0291bc25ed3145a580582fa30a965fd76047daba 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -81,6 +81,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) +list(REMOVE_ITEM TEST_OPS test_layers) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -118,7 +119,7 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) - +py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1) if(NOT WIN32) py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) endif() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a84ff1fd6d46c30ad7aa72f1b29a8ae668b90e20 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py @@ -0,0 +1,48 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.fluid as fluid +import importlib + +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) + +from test_elementwise_add_op import * +from test_elementwise_sub_op import * +from test_concat_op import * +from test_gather_op import * +from test_gaussian_random_batch_size_like_op import * +from test_uniform_random_batch_size_like_op import * +from test_fill_constant_batch_size_like_op import * +from test_lod_reset_op import * +from test_scatter_op import * +from test_mean_op import * +from test_slice_op import * +from test_linear_chain_crf_op import * +from test_bilinear_interp_op import * +from test_nearest_interp_op import * +from test_sequence_concat import * +from test_seq_conv import * +from test_seq_pool import * +from test_sequence_expand_as import * +from test_sequence_expand import * +from test_sequence_pad_op import * +from test_sequence_unpad_op import * +from test_sequence_scatter_op import * +from test_sequence_slice_op import * +from test_pad2d_op import * +from test_fill_zeros_like2_op import * + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..935653b07a6a4e1d344e8040fa4a0ed72b9b164d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
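The `fill_zeros_like2` op covered by the next test is the one `backward.py` (earlier in this diff) now inserts for pruned gradient branches; unlike `fill_zeros_like`, it takes an explicit `dtype` attribute so the zero gradient can be pinned to the forward variable's dtype. A numpy sketch of that contract (the function here is an illustration, not the op's implementation):

import numpy as np

def fill_zeros_like2(x, dtype):
    # Out: zeros with X's shape and the explicitly requested dtype.
    return np.zeros(x.shape, dtype=dtype)

x = np.random.random((219, 232)).astype(np.float32)
assert fill_zeros_like2(x, np.float16).shape == x.shape
assert fill_zeros_like2(x, np.float16).dtype == np.float16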
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.framework import convert_np_dtype_to_dtype_ +from op_test import OpTest + + +class TestFillZerosLike2Op(OpTest): + def setUp(self): + self.op_type = "fill_zeros_like2" + self.dtype = np.float32 + self.init_dtype() + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} + self.outputs = {'Out': np.zeros_like(self.inputs["X"])} + self.attrs = {'dtype': convert_np_dtype_to_dtype_(self.dtype)} + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output() + + +class TestFillZerosLike2OpFp16(TestFillZerosLike2Op): + def init_dtype(self): + self.dtype = np.float16 + + +class TestFillZerosLike2OpFp64(TestFillZerosLike2Op): + def init_dtype(self): + self.dtype = np.float64 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 959b48d03cac533af55017f2c7297cc1ac006f51..576ca57a3c518b68540ea8e682118bf4b731c308 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -348,6 +348,55 @@ class TestImperative(unittest.TestCase): self.assertEqual(mlp._fc2, sublayers[1]) self.assertEqual(len(sublayers), 2) + def test_dygraph_vs_static(self): + inp1 = np.random.rand(4, 3, 3) + inp2 = np.random.rand(4, 3, 3) + + # dynamic graph + with fluid.dygraph.guard(): + if np.sum(inp1) < np.sum(inp2): + x = fluid.layers.elementwise_add(inp1, inp2) + else: + x = fluid.layers.elementwise_sub(inp1, inp2) + dygraph_result = x._numpy() + + # static graph + with new_program_scope(): + inp_data1 = fluid.layers.data( + name='inp1', shape=[3, 3], dtype=np.float32) + inp_data2 = fluid.layers.data( + name='inp2', shape=[3, 3], dtype=np.float32) + + a = fluid.layers.expand( + fluid.layers.reshape( + fluid.layers.reduce_sum(inp_data1), [1, 1]), [4, 1]) + b = fluid.layers.expand( + fluid.layers.reshape( + fluid.layers.reduce_sum(inp_data2), [1, 1]), [4, 1]) + cond = fluid.layers.less_than(x=a, y=b) + + ie = fluid.layers.IfElse(cond) + with ie.true_block(): + d1 = ie.input(inp_data1) + d2 = ie.input(inp_data2) + d3 = fluid.layers.elementwise_add(d1, d2) + ie.output(d3) + + with ie.false_block(): + d1 = ie.input(inp_data1) + d2 = ie.input(inp_data2) + d3 = fluid.layers.elementwise_sub(d1, d2) + ie.output(d3) + out = ie() + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + static_result = exe.run(fluid.default_main_program(), + feed={'inp1': inp1, + 'inp2': inp2}, + fetch_list=out)[0] + self.assertTrue(np.allclose(dygraph_result, static_result)) + def test_rnn(self): np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 2fd82884f4a8d07ced06d5775e5b414ebc2ef654..1abbdf7d6702449ff84768533dfe80b941afe79c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -595,6 +595,280 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_rlt2, static_rlt)) self.assertTrue(np.allclose(nce_loss3.numpy(), static_rlt)) + def test_conv3d(self): + with self.static_graph(): + images = layers.data( + name='pixel', shape=[3, 6, 6, 6], dtype='float32') + ret = layers.conv3d(input=images, num_filters=3, 
filter_size=2) + static_ret = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 6, 6, 6], dtype='float32')}, + fetch_list=[ret])[0] + + with self.static_graph(): + images = layers.data( + name='pixel', shape=[3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) + ret = conv3d(images) + static_ret2 = self.get_static_graph_result( + feed={'pixel': np.ones( + [2, 3, 6, 6, 6], dtype='float32')}, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + images = np.ones([2, 3, 6, 6, 6], dtype='float32') + conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) + dy_ret = conv3d(base.to_variable(images)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_row_conv(self): + input = np.arange(15).reshape([3, 5]).astype('float32') + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + with self.static_graph(): + x = layers.data( + name='X', + shape=[3, 5], + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.row_conv(input=x, future_context_size=2) + static_ret = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + x = layers.data( + name='X', + shape=[3, 5], + dtype='float32', + lod_level=1, + append_batch_size=False) + rowConv = nn.RowConv('RowConv', future_context_size=2) + ret = rowConv(x) + static_ret2 = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + # TODO: dygraph can't support LODTensor + + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_group_norm(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + shape = (2, 4, 3, 3) + + input = np.random.random(shape).astype('float32') + + with self.static_graph(): + X = fluid.layers.data( + name='X', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.group_norm(input=X, groups=2) + static_ret = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + X = fluid.layers.data( + name='X', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + groupNorm = nn.GroupNorm('GroupNorm', groups=2) + ret = groupNorm(X) + static_ret2 = self.get_static_graph_result( + feed={ + 'X': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.dynamic_graph(): + groupNorm = nn.GroupNorm('GroupNorm', groups=2) + dy_ret = groupNorm(base.to_variable(input)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_spectral_norm(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + + shape = (2, 4, 3, 3) + + input = np.random.random(shape).astype('float32') + + with self.static_graph(): + Weight = fluid.layers.data( + name='Weight', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + ret = layers.spectral_norm(weight=Weight, dim=1, power_iters=2) + static_ret = 
self.get_static_graph_result( + feed={ + 'Weight': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place), + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.static_graph(): + Weight = fluid.layers.data( + name='Weight', + shape=shape, + dtype='float32', + lod_level=1, + append_batch_size=False) + spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) + ret = spectralNorm(Weight) + static_ret2 = self.get_static_graph_result( + feed={ + 'Weight': fluid.create_lod_tensor( + data=input, recursive_seq_lens=[[1, 1]], place=place) + }, + fetch_list=[ret], + with_lod=True)[0] + + with self.dynamic_graph(): + spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) + dy_ret = spectralNorm(base.to_variable(input)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, static_ret2)) + + def test_tree_conv(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + adj_array = [1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10] + adj = np.array(adj_array).reshape((1, 9, 2)).astype('int32') + adj = np.tile(adj, (1, 1, 1)) + vectors = np.random.random((1, 10, 5)).astype('float32') + with self.static_graph(): + NodesVector = fluid.layers.data( + name='NodesVector', + shape=(1, 10, 5), + dtype='float32', + lod_level=1, + append_batch_size=False) + EdgeSet = fluid.layers.data( + name='EdgeSet', + shape=(1, 9, 2), + dtype='int32', + lod_level=1, + append_batch_size=False) + ret = layers.tree_conv( + nodes_vector=NodesVector, + edge_set=EdgeSet, + output_size=6, + num_filters=1, + max_depth=2) + static_ret = self.get_static_graph_result( + feed={ + 'NodesVector': fluid.create_lod_tensor( + data=vectors, recursive_seq_lens=[[1]], place=place), + 'EdgeSet': fluid.create_lod_tensor( + data=adj, recursive_seq_lens=[[1]], place=place) + }, + fetch_list=[ret], + with_lod=False)[0] + + with self.static_graph(): + NodesVector = fluid.layers.data( + name='NodesVector', + shape=(1, 10, 5), + dtype='float32', + lod_level=1, + append_batch_size=False) + EdgeSet = fluid.layers.data( + name='EdgeSet', + shape=(1, 9, 2), + dtype='int32', + lod_level=1, + append_batch_size=False) + treeConv = nn.TreeConv( + 'TreeConv', output_size=6, num_filters=1, max_depth=2) + ret = treeConv(NodesVector, EdgeSet) + static_ret2 = self.get_static_graph_result( + feed={ + 'NodesVector': fluid.create_lod_tensor( + data=vectors, recursive_seq_lens=[[1]], place=place), + 'EdgeSet': fluid.create_lod_tensor( + data=adj, recursive_seq_lens=[[1]], place=place) + }, + fetch_list=[ret], + with_lod=False)[0] + + with self.dynamic_graph(): + treeConv = nn.TreeConv( + 'TreeConv', output_size=6, num_filters=1, max_depth=2) + dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj)) + + self.assertTrue(np.allclose(static_ret, static_ret2)) + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + + def test_conv3d_transpose(self): + input_array = np.arange(0, 48).reshape( + [2, 3, 2, 2, 2]).astype('float32') + + with self.static_graph(): + img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') + out = layers.conv3d_transpose( + input=img, num_filters=12, filter_size=12, use_cudnn=False) + static_rlt = self.get_static_graph_result( + feed={'pixel': input_array}, fetch_list=[out])[0] + with self.static_graph(): + img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + 
filter_size=12, + use_cudnn=False) + out = conv3d_transpose(img) + static_rlt2 = self.get_static_graph_result( + feed={'pixel': input_array}, fetch_list=[out])[0] + with self.dynamic_graph(): + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + dy_rlt = conv3d_transpose(base.to_variable(input_array)) + self.assertTrue(np.allclose(static_rlt2, static_rlt)) + self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + class TestBook(unittest.TestCase): def test_fit_a_line(self): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index bda8b666dcde22b0e4bacdb5db252267f4c7e34b..645b0188d5f45935ace074ba343de246af476b41 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -38,7 +38,15 @@ def Lenet(data, class_dim): class TestFetchAndFeed(unittest.TestCase): - def parallel_exe(self, use_cuda, run_parallel_exe, seed=1): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def parallel_exe(self, + use_cuda, + run_parallel_exe, + use_experimental_executor=False, + seed=1): main_program = fluid.Program() startup = fluid.Program() startup.random_seed = seed @@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase): build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = False build_strategy.memory_optimize = False + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.use_experimental_executor = use_experimental_executor train_cp = compiler.CompiledProgram(main_program).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy) + loss_name=loss.name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) run_parallel_exe(train_cp, exe, use_cuda, data, label, loss) @@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase): if batch_id == 2: break - def test_fetch(self): - os.environ['CPU_NUM'] = str(4) + def test_fetch_with_threaded_executor(self): if core.is_compiled_with_cuda(): self.parallel_exe( use_cuda=True, @@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase): self.parallel_exe( use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch) + def test_fetch_with_fast_threaded_executor(self): + if core.is_compiled_with_cuda(): + self.parallel_exe( + use_cuda=True, + run_parallel_exe=self.run_parallel_exe_with_fetch, + use_experimental_executor=True) + self.parallel_exe( + use_cuda=False, + run_parallel_exe=self.run_parallel_exe_with_fetch, + use_experimental_executor=True) + def test_feed(self): - os.environ['CPU_NUM'] = str(4) if core.is_compiled_with_cuda(): self.parallel_exe( use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed)
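The executor selection that the updated test toggles comes down to a few lines of strategy wiring. A minimal sketch, assuming a `main_program` with a `loss` variable has already been built:

import paddle.fluid as fluid
from paddle.fluid import compiler

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = True  # the fast threaded executor

build_strategy = fluid.BuildStrategy()
train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
    loss_name=loss.name,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)
# train_cp is then passed to exe.run(...), as in the test above.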