From 9f8f0fc2d307e85045dc45d362bb5672a9b24011 Mon Sep 17 00:00:00 2001 From: Dun Date: Mon, 21 Jan 2019 11:18:51 +0800 Subject: [PATCH] Memory optimization of depthwise conv op and group norm op (#15313) * mem opt * test=develop * test=develop * test=develop * test=develop * test=develop * test=develop * test=develop * refine code test=develop * refine code test=develop * refine code test=develop * refine code test=develop * refine with cub test=develop * fix mkldnn test && remove comments && test=develop * polish code && test=develop * add only_forward test && test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 3 +- .../fluid/framework/details/build_strategy.cc | 10 + .../fluid/framework/details/build_strategy.h | 2 + paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/fuse_relu_depthwise_conv_pass.cc | 159 ++++++++++++ .../ir/fuse_relu_depthwise_conv_pass.h | 42 ++++ paddle/fluid/operators/conv_mkldnn_op.cc | 15 +- paddle/fluid/operators/conv_op.cc | 34 ++- paddle/fluid/operators/conv_op.h | 49 ++-- paddle/fluid/operators/group_norm_op.cc | 33 ++- paddle/fluid/operators/group_norm_op.cu | 186 +++++++------- paddle/fluid/operators/group_norm_op.h | 69 +++--- paddle/fluid/operators/math/depthwise_conv.cu | 233 ++++++++++-------- paddle/fluid/operators/math/depthwise_conv.h | 9 +- paddle/fluid/pybind/pybind.cc | 14 ++ .../paddle/fluid/contrib/memory_usage_calc.py | 2 +- python/paddle/fluid/layers/nn.py | 1 + .../unittests/parallel_executor_test_base.py | 5 +- .../fluid/tests/unittests/test_conv2d_op.py | 86 ++++++- .../test_fuse_relu_depthwise_conv_pass.py | 149 +++++++++++ 20 files changed, 846 insertions(+), 256 deletions(-) create mode 100644 paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c1ba6606f10..d5966ad5a97 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -93,5 +93,6 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass multi_batch_merge_pass + fuse_elewise_add_act_pass multi_batch_merge_pass + fuse_relu_depthwise_conv_pass memory_optimize_pass lock_free_optimize_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index df0ff772c9d..756470c5b0b 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -55,6 +55,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } // Add op fusion. + if (strategy.fuse_relu_depthwise_conv_) { + AppendPass("fuse_relu_depthwise_conv_pass"); + } if (strategy.fuse_elewise_add_act_ops_) { auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); // Add a graph viz pass to record a graph. 
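// A rough host-side sketch of the memory argument behind this change: once
// relu is folded into the depthwise conv kernels, the relu output no longer
// has to stay resident for the backward pass, because the kernels recompute
// max(0, x) on the fly from the conv input. The NCHW shape below is a
// hypothetical example, not taken from this patch.
#include <cstdio>

int main() {
  const long long n = 32, c = 256, h = 56, w = 56;  // hypothetical activation shape
  const long long bytes = n * c * h * w * static_cast<long long>(sizeof(float));
  // Unfused: x, relu(x) and the conv output are all live for backward.
  // Fused: relu(x) is never materialized, saving roughly `bytes` per layer.
  std::printf("relu output not materialized: %lld bytes (~%.1f MiB) saved\n",
              bytes, bytes / (1024.0 * 1024.0));
  return 0;
}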
@@ -210,6 +213,12 @@ std::unique_ptr BuildStrategy::Apply( pass->Set>( kAllOpDescs, new std::vector(main_program.Block(0).AllOps())); + } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { + if (!use_cuda) { + LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; + continue; + } } graph = pass->Apply(std::move(graph)); } @@ -220,6 +229,7 @@ std::unique_ptr BuildStrategy::Apply( } // namespace framework } // namespace paddle +USE_PASS(fuse_relu_depthwise_conv_pass); USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 15c2e01b614..603df2e0693 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,6 +74,8 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; + bool fuse_relu_depthwise_conv_{false}; + bool memory_optimize_{false}; bool memory_early_delete_{false}; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 84b53212647..b118dccd1b3 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -70,6 +70,7 @@ if(WITH_MKLDNN) endif() cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc new file mode 100644 index 00000000000..0d94008ea82 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
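// The pass defined next rewires the graph so that the depthwise conv (and its
// grad op) read the relu input directly; the relu nodes and their output then
// become dead and are removed. A toy sketch of that rewiring idea only -- it
// does not use Paddle's ir::Graph / GraphPatternDetector API:
#include <iostream>
#include <string>
#include <vector>

struct ToyOp {
  std::string type;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
  bool fuse_relu_before_conv;
};

int main() {
  // x --relu--> y --depthwise_conv2d--> z
  std::vector<ToyOp> graph = {{"relu", {"x"}, {"y"}, false},
                              {"depthwise_conv2d", {"y"}, {"z"}, false}};
  for (auto& op : graph) {
    if (op.type == "depthwise_conv2d" && op.inputs[0] == "y") {
      op.inputs[0] = "x";               // like layer_op->SetInput("Input", {x})
      op.fuse_relu_before_conv = true;  // like SetAttr("fuse_relu_before_depthwise_conv", true)
    }
  }
  graph.erase(graph.begin());  // drop the now-dead relu node (and y with it)
  for (const auto& op : graph)
    std::cout << op.type << "(" << op.inputs[0] << ") -> " << op.outputs[0] << "\n";
  return 0;
}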
+ +#include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr FuseReluDepthwiseConvPass::ApplyImpl( + std::unique_ptr graph) const { + graph = FuseReluDepthwiseConv(std::move(graph), true); + graph = FuseReluDepthwiseConv(std::move(graph), false); + return graph; +} + +std::unique_ptr FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( + std::unique_ptr graph, bool only_forward) const { + PADDLE_ENFORCE(graph.get()); + if (only_forward) + FusePassBase::Init("relu_depthwise_conv_only_forward", graph.get()); + else + FusePassBase::Init("relu_depthwise_conv", graph.get()); + /* + x ---act--> y ---layer-> z + +----------+ + ↓ ↓ + x' <--act'--- y' <-layer'--- z' + + fuse to: + + x ---act-layer-> z + | + ↓ + x' <--act-layer'--- z' + + */ + + GraphPatternDetector gpd; + auto *pattern = gpd.mutable_pattern(); + std::string act_type = "relu"; + std::string layer_type = "depthwise_conv2d"; + auto *x = pattern->NewNode("x")->AsInput(); + auto *y = pattern->NewNode("y")->AsIntermediate(); + auto *z = pattern->NewNode("z")->AsOutput(); + PDNode *xg = nullptr; + PDNode *yg = nullptr; + PDNode *zg = nullptr; + if (!only_forward) { + xg = pattern->NewNode("xg")->AsOutput(); + yg = pattern->NewNode("yg")->AsIntermediate(); + zg = pattern->NewNode("zg")->AsInput(); + } + + PDNode *act_g = nullptr; + PDNode *layer_g = nullptr; + auto *act = pattern->NewNode("act")->assert_is_op(act_type); + auto *layer = pattern->NewNode("layer")->assert_is_op(layer_type); + if (!only_forward) { + act_g = pattern->NewNode("act_g")->assert_is_op(act_type + "_grad"); + layer_g = pattern->NewNode("layer_g")->assert_is_op(layer_type + "_grad"); + } + + act->LinksFrom({x}).LinksTo({y}); + layer->LinksFrom({y}).LinksTo({z}); + if (!only_forward) { + layer_g->LinksFrom({y, zg}).LinksTo({yg}); + act_g->LinksFrom({y, yg}).LinksTo({xg}); + } + + int count = 0; + std::unordered_set need_removed_nodes; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseReluDepthwiseConv fuse"; + // 1. turn on fuse option + auto *layer_op = subgraph.at(layer)->Op(); + layer_op->SetAttr("use_cudnn", false); + layer_op->SetAttr("fuse_relu_before_depthwise_conv", true); + + OpDesc *layer_g_op = nullptr; + if (!only_forward) { + layer_g_op = subgraph.at(layer_g)->Op(); + layer_g_op->SetAttr("use_cudnn", false); + layer_g_op->SetAttr("fuse_relu_before_depthwise_conv", true); + } + // 2. 
connect x to layer and layer_g, layer_g to xg + auto *y_var = subgraph.at(y)->Var(); + auto *x_var = subgraph.at(x)->Var(); + VarDesc *yg_var = nullptr; + VarDesc *xg_var = nullptr; + if (!only_forward) { + yg_var = subgraph.at(yg)->Var(); + xg_var = subgraph.at(xg)->Var(); + } + + PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name()); + layer_op->SetInput("Input", {x_var->Name()}); + subgraph.at(layer)->inputs.push_back(subgraph.at(x)); + subgraph.at(x)->outputs.push_back(subgraph.at(layer)); + VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name(); + + if (!only_forward) { + PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name()); + layer_g_op->SetInput("Input", {x_var->Name()}); + subgraph.at(layer_g)->inputs.push_back(subgraph.at(x)); + subgraph.at(x)->outputs.push_back(subgraph.at(layer_g)); + + PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1); + PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0], + yg_var->Name()); + layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()}); + subgraph.at(layer_g)->outputs.push_back(subgraph.at(xg)); + subgraph.at(xg)->inputs.push_back(subgraph.at(layer_g)); + VLOG(4) << "replace " << yg_var->Name() << " -> " << xg_var->Name(); + } + + // 3. delete y, yg, act, act_g + + if (only_forward) { + need_removed_nodes.insert({subgraph.at(y), subgraph.at(act)}); + } else { + need_removed_nodes.insert({subgraph.at(y), subgraph.at(yg), + subgraph.at(act), subgraph.at(act_g)}); + } + count++; + }; + gpd(graph.get(), handler); + GraphSafeRemoveNodes(graph.get(), need_removed_nodes); + AddStatis(count); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_relu_depthwise_conv_pass, + paddle::framework::ir::FuseReluDepthwiseConvPass); diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h new file mode 100644 index 00000000000..6bd653775e4 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
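// A host-side 1-D sketch (hypothetical sizes) of the identity the fused CUDA
// kernels later in this patch rely on: conv(relu(x)) gives the same result as
// a conv that applies max(0, .) to each input element as it is read, so the
// relu output never needs to be stored.
#include <algorithm>
#include <cassert>
#include <vector>

// Plain "valid" 1-D correlation, optionally applying relu while reading input.
static std::vector<float> Conv1D(const std::vector<float>& in,
                                 const std::vector<float>& w, bool fuse_relu) {
  std::vector<float> out(in.size() - w.size() + 1, 0.f);
  for (size_t o = 0; o < out.size(); ++o)
    for (size_t k = 0; k < w.size(); ++k) {
      float v = in[o + k];
      if (fuse_relu) v = std::max(v, 0.f);  // relu folded into the read
      out[o] += w[k] * v;
    }
  return out;
}

int main() {
  const std::vector<float> x = {-1.f, 2.f, -3.f, 4.f, 0.5f, -0.5f};
  const std::vector<float> w = {0.25f, 0.5f, 0.25f};

  std::vector<float> relu_x(x.size());
  for (size_t i = 0; i < x.size(); ++i) relu_x[i] = std::max(x[i], 0.f);

  const std::vector<float> a = Conv1D(relu_x, w, /*fuse_relu=*/false);
  const std::vector<float> b = Conv1D(x, w, /*fuse_relu=*/true);
  for (size_t i = 0; i < a.size(); ++i) assert(a[i] == b[i]);
  return 0;
}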
+#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the relu and depthwise conv + */ +class FuseReluDepthwiseConvPass : public FusePassBase { + public: + virtual ~FuseReluDepthwiseConvPass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr FuseReluDepthwiseConv( + std::unique_ptr graph, bool only_forward) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 16ffc11419f..0ce174654e8 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -143,7 +143,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // Get unique name for storing MKLDNN primitives const std::string key = platform::ConvMKLDNNHandler::GetHash( src_tz, weights_tz, strides, paddings, dilations, groups, - ctx.op().Output("Output")); + ctx.op().Input("Input") + ctx.op().Input("Filter")); const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; @@ -371,7 +371,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::ConvMKLDNNHandler::AppendKey( &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, input->format(), fuse_relu, fuse_residual_conn, - ctx.op().Output("Output")); + ctx.op().Input("Input") + ctx.op().Input("Filter")); const std::string key_conv_pd = key + "@conv_pd"; bool need_s8_to_u8 = false; @@ -798,7 +798,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* input = ctx.Input("Input"); const Tensor* filter = ctx.Input("Filter"); - const Tensor* output = ctx.Input("Output"); const Tensor* output_grad = ctx.Input(framework::GradVarName("Output")); Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); @@ -810,9 +809,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && filter->format() != memory::format::format_undef, "Wrong layout/format set for Filter tensor"); - PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN && - output->format() != memory::format::format_undef, - "Wrong layout/format set for Output tensor"); PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN && output_grad->format() != memory::format::format_undef, "Wrong layout/format set for output_grad tensor"); @@ -840,18 +836,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); int g = std::max(groups, 1); GetWeightsTz(weights_tz, g, is_conv3d); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + std::vector dst_tz = + paddle::framework::vectorize2int(output_grad->dims()); auto src_format = input->format(); mkldnn::memory::format weights_format = GetWeightsFormat(filter->format(), g, is_conv3d); - // Get an unique name from "argument" name of "Output" variable + // Get an unique name from "argument" name of "input" and "Filter" variable // as well as attributes of primitive to be created // This name will be used as key when saving info into device context const std::string key = platform::ConvMKLDNNHandler::GetHash( src_tz, weights_tz, strides, paddings, dilations, groups, - 
ctx.op().Input("Output")); + ctx.op().Input("Input") + ctx.op().Input("Filter")); const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index c8b33b8932d..bd788f03e7d 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -171,6 +171,9 @@ void Conv2DOpMaker::Make() { "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("fuse_relu_before_depthwise_conv", + "(bool, default false) Only used in cuda depthwise kernel") + .SetDefault(false); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); @@ -412,18 +415,43 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( customized_type_value); } +class Conv2dGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType(GradOpType()); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput("Bias", Input("Bias")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); + } + + virtual std::string GradOpType() const { + return this->ForwardOpType() + "_grad"; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, - ops::ConvOpInferVarType, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvOpInferVarType, ops::Conv2dGradMaker); REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); // depthwise convolution op REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvOpInferVarType, ops::Conv2dGradMaker); REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index eaa288edc55..797c6651659 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -397,12 +397,18 @@ class DepthwiseConvKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); std::vector dilations = context.Attr>("dilations"); - - math::DepthwiseConvFunctor depthwiseConv; - + bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); auto& dev_ctx = context.template device_context(); - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output); + + if (fuse_relu) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); + } else { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); + } } }; @@ -424,27 +430,42 @@ class DepthwiseConvGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); std::vector dilations = context.Attr>("dilations"); + bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); 
math::SetConstant set_zero; auto& dev_ctx = context.template device_context(); - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - if (input_grad) { input_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, input_grad, static_cast(0)); - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad); + + if (fuse_relu) { + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad); + } else { + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad); + } } if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, - dilations, filter_grad); + if (fuse_relu) { + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad); + } else { + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad); + } } } }; diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 4fa15058f86..e18d9841bb8 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/group_norm_op.h" +#include namespace paddle { namespace operators { @@ -102,8 +103,8 @@ class GroupNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of GroupNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Mean"), "Input(Mean) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Variance"), @@ -113,7 +114,7 @@ class GroupNormGradOp : public framework::OperatorWithKernel { // check output if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y")); } if (ctx->HasOutput(framework::GradVarName("Scale"))) { ctx->SetOutputDim(framework::GradVarName("Scale"), @@ -145,12 +146,36 @@ class GroupNormGradOp : public framework::OperatorWithKernel { } }; +class GroupNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("group_norm_grad"); + op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetInput("Y", Output("Y")); + op->SetInput("Mean", Output("Mean")); + op->SetInput("Variance", Output("Variance")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + + 
op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::GroupNormGradMaker); REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp); REGISTER_OP_CPU_KERNEL( group_norm, ops::GroupNormKernel, diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 27174630227..6e460c470be 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -12,12 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "cub/cub.cuh" #include "paddle/fluid/operators/group_norm_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" namespace paddle { namespace operators { +enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; + +#define CHECK_CASE(i, flags, kernel_name, args...) \ + if (i == flags) { \ + kernel_name<<>>(args); \ + } + +// 0 for no scale, no bias +// 1 for has scale, no bias +// 2 for no scale, has bias +// 3 for has scale, has bias +#define UNROLL_ALL_CASES(flags, kernel_name, args...) \ + CHECK_CASE(0, flags, kernel_name, args) \ + CHECK_CASE(1, flags, kernel_name, args) \ + CHECK_CASE(2, flags, kernel_name, args) \ + CHECK_CASE(3, flags, kernel_name, args) + +template +__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { + typedef cub::WarpReduce WarpReduce; + typename WarpReduce::TempStorage temp_storage; + value = WarpReduce(temp_storage).Sum(value); + if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value); +} + template __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int imsize, int groups, @@ -36,21 +62,11 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, } x_mean /= number * imsize; x_var /= number * imsize; - __shared__ T s_mem[2]; - if (threadIdx.x == 0) { - s_mem[0] = s_mem[1] = 0; - } - __syncthreads(); - paddle::platform::CudaAtomicAdd(&s_mem[0], x_mean); - paddle::platform::CudaAtomicAdd(&s_mem[1], x_var); - __syncthreads(); - if (threadIdx.x == 0) { - paddle::platform::CudaAtomicAdd(&mean[bid * groups + gid], s_mem[0]); - paddle::platform::CudaAtomicAdd(&var[bid * groups + gid], s_mem[1]); - } + CudaAtomicAddWithWarp(&mean[bid * groups + gid], x_mean); + CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } -template +template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, int imsize, int groups, int group_size, @@ -68,8 +84,8 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val = x[(bid * C + ccid) * imsize + imid]; val = (val - x_mean) * var_inv; - if (scale) val *= scale[gid * group_size + cid]; - if (bias) val += bias[gid * group_size + cid]; + if (flags & kHasScale) val *= scale[gid * group_size + cid]; + if (flags & kHasBias) val += bias[gid * group_size + cid]; y[(bid * C + ccid) * imsize + imid] = val; } } @@ -115,93 +131,87 @@ class GroupNormKernel if (bias) bias_data = bias->data(); int imsize = x_dims[2] * x_dims[3]; - int block_size = std::min(512, imsize); + int block_size = std::min(1024, imsize); dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); 
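// The rewritten backward later in this file no longer reads X or Mean: since
// y = ((x - mean) / sqrt(var + eps)) * scale + bias, the normalized value
// v = (y - bias) / scale can be recovered from Y alone, and per group of m
// elements
//   dL/dx_i = (g_i - mean_j(g_j) - v_i * mean_j(g_j * v_j)) / sqrt(var + eps),
// with g_j = dL/dy_j * scale. A host-side sketch with a single group and a
// single channel (hypothetical values, not Paddle code) checking that formula
// against finite differences:
#include <cassert>
#include <cmath>
#include <vector>

static const double kEps = 1e-5;

static std::vector<double> GroupNormForward(const std::vector<double>& x,
                                            double scale, double bias) {
  const double m = static_cast<double>(x.size());
  double mean = 0, var = 0;
  for (double v : x) mean += v / m;
  for (double v : x) var += (v - mean) * (v - mean) / m;
  std::vector<double> y(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    y[i] = (x[i] - mean) / std::sqrt(var + kEps) * scale + bias;
  return y;
}

int main() {
  const std::vector<double> x = {0.3, -1.2, 2.5, 0.7, -0.4, 1.9};
  const std::vector<double> dy = {1.0, -0.5, 0.25, 2.0, -1.5, 0.75};
  const double scale = 1.7, bias = 0.3, m = static_cast<double>(x.size());

  // Build the fixture: the op itself only keeps Variance; mean is used here
  // just to compute it.
  double mean = 0, var = 0;
  for (double v : x) mean += v / m;
  for (double v : x) var += (v - mean) * (v - mean) / m;
  const std::vector<double> y = GroupNormForward(x, scale, bias);

  double dp_scale = 0, dp_bias = 0;  // sum(g * v) and sum(g), as in the kernels
  for (size_t i = 0; i < x.size(); ++i) {
    const double v = (y[i] - bias) / scale;
    dp_scale += dy[i] * scale * v;
    dp_bias += dy[i] * scale;
  }
  for (size_t i = 0; i < x.size(); ++i) {
    const double v = (y[i] - bias) / scale;
    const double dx = (dy[i] * scale - dp_bias / m - v * dp_scale / m) /
                      std::sqrt(var + kEps);
    // Finite-difference check of dL/dx_i with L = sum(dy .* y).
    const double h = 1e-6;
    std::vector<double> xp = x, xm = x;
    xp[i] += h;
    xm[i] -= h;
    const std::vector<double> yp = GroupNormForward(xp, scale, bias);
    const std::vector<double> ym = GroupNormForward(xm, scale, bias);
    double lp = 0, lm = 0;
    for (size_t j = 0; j < x.size(); ++j) {
      lp += dy[j] * yp[j];
      lm += dy[j] * ym[j];
    }
    assert(std::fabs(dx - (lp - lm) / (2 * h)) < 1e-4);
  }
  return 0;
}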
GroupNormForwardGetMeanAndVar<<>>( x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data, temp_var_data); - GroupNormForward<<>>( - x_data, mean_data, temp_var_data, scale_data, bias_data, x_dims[0], - x_dims[1], imsize, groups, group_size, epsilon, y_data, var_data); + int flags = + (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, + scale_data, bias_data, x_dims[0], x_dims[1], imsize, + groups, group_size, epsilon, y_data, var_data); } }; -template -__global__ void GroupNormBackwardGetMeanAndVar( - const T* x, const T* mean, const T* var, const T* scale, const T* d_y, - int N, int C, int imsize, int groups, int group_size, T epsilon, T* d_x, - T* d_mean, T* d_var, T* d_scale, T* d_bias) { +template +__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, + const T* bias, const T* d_y, + int N, int C, int imsize, + int groups, int group_size, + T epsilon, T* d_mean, T* d_var, + T* d_scale, T* d_bias) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; int number = min(group_size, static_cast(C - gid * group_size)); int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; - T x_var = var[bid * groups + gid]; - T var_inv = 1.0 / sqrt(x_var + epsilon); - T d_var_inv = 0, d_x_mean = 0; + T x_scale = (flags & kHasScale) ? scale[ccid] : 1; + T x_bias = (flags & kHasBias) ? bias[ccid] : 0; + T x_scale_inv = 0; + if (x_scale != 0) x_scale_inv = 1.0 / x_scale; T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T val = (tmp - x_mean) * var_inv; + T val = x[(bid * C + ccid) * imsize + imid] - x_bias; T dval = d_y[(bid * C + ccid) * imsize + imid]; - if (d_bias) d_bias_data += dval; - if (d_scale) d_scale_data += val * dval; - if (scale) dval = dval * scale[ccid]; - d_var_data += (tmp - x_mean) * dval; - T d_tmp = dval * var_inv; - if (d_x) d_x[(bid * C + ccid) * imsize + imid] = d_tmp; - d_mean_data -= d_tmp; - } - __shared__ T s_mem[4]; - if (threadIdx.x == 0) { - s_mem[0] = s_mem[1] = 0; - if (d_scale) s_mem[2] = 0; - if (d_bias) s_mem[3] = 0; - } - __syncthreads(); - paddle::platform::CudaAtomicAdd(&s_mem[0], d_mean_data); - paddle::platform::CudaAtomicAdd(&s_mem[1], d_var_data); - if (d_scale) paddle::platform::CudaAtomicAdd(&s_mem[2], d_scale_data); - if (d_bias) paddle::platform::CudaAtomicAdd(&s_mem[3], d_bias_data); - __syncthreads(); - if (threadIdx.x == 0) { - paddle::platform::CudaAtomicAdd(&d_mean[bid * groups + gid], s_mem[0]); - paddle::platform::CudaAtomicAdd(&d_var[bid * groups + gid], s_mem[1]); - if (d_scale) paddle::platform::CudaAtomicAdd(&d_scale[ccid], s_mem[2]); - if (d_bias) paddle::platform::CudaAtomicAdd(&d_bias[ccid], s_mem[3]); + d_var_data += val * dval; + d_mean_data += dval * x_scale; + + val = val * x_scale_inv; + d_bias_data += dval; + d_scale_data += val * dval; } + CudaAtomicAddWithWarp(&d_mean[bid * groups + gid], d_mean_data); + CudaAtomicAddWithWarp(&d_var[bid * groups + gid], d_var_data); + if (flags & kHasScale) CudaAtomicAddWithWarp(&d_scale[ccid], d_scale_data); + if (flags & kHasBias) CudaAtomicAddWithWarp(&d_bias[ccid], d_bias_data); } -template -__global__ void GroupNormBackward(const T* x, const T* mean, const T* var, - const T* d_mean, const T* d_var, int N, int C, - int imsize, int groups, int group_size, - T epsilon, T* d_x) { 
+template +__global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, + const T* bias, const T* var, const T* d_mean, + const T* d_var, int N, int C, int imsize, + int groups, int group_size, T epsilon, + T* d_x) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; int number = min(group_size, static_cast(C - gid * group_size)); int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; T x_var = var[bid * groups + gid]; T d_x_mean = d_mean[bid * groups + gid]; - T d_var_inv = d_var[bid * groups + gid]; + T d_x_var = d_var[bid * groups + gid]; + + T x_var_inv = 1.0 / sqrt(x_var + epsilon); + T number_inv = 1.0 / (number * imsize); + + T x_scale = (flags & kHasScale) ? scale[ccid] : 1; + T x_bias = (flags & kHasBias) ? bias[ccid] : 0; + T x_scale_inv = 0; + if (x_scale != 0) x_scale_inv = 1.0 / x_scale; - T d_x_var = - -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv; - d_x_mean -= 2 * d_x_var * x_mean; - d_x_var /= number * imsize; - d_x_mean /= number * imsize; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T tmp = x[(bid * C + ccid) * imsize + imid]; - if (d_x) - d_x[(bid * C + ccid) * imsize + imid] += d_x_mean + tmp * 2 * d_x_var; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * C + ccid) * imsize + imid]; + d_x[(bid * C + ccid) * imsize + imid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); } } @@ -211,10 +221,10 @@ class GroupNormGradKernel public: void Compute(const framework::ExecutionContext& ctx) const override { const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("X"); - auto* mean = ctx.Input("Mean"); + auto* x = ctx.Input("Y"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); auto* d_y = ctx.Input(framework::GradVarName("Y")); const auto groups = ctx.Attr("groups"); @@ -226,11 +236,7 @@ class GroupNormGradKernel const auto& x_dims = x->dims(); const int group_size = (x_dims[1] - 1) / groups + 1; - T* d_x_data = nullptr; - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - d_x_data = d_x->data(); - } + d_x->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); @@ -245,8 +251,9 @@ class GroupNormGradKernel T* temp_mean_data = temp_mean.data(); auto* x_data = x->data(); + T* d_x_data = nullptr; + if (d_x) d_x_data = d_x->data(); auto* y_data = d_y->data(); - auto* mean_data = mean->data(); auto* var_data = var->data(); T* d_scale_data = nullptr; if (d_scale) { @@ -263,18 +270,25 @@ class GroupNormGradKernel const T* scale_data = nullptr; if (scale) scale_data = scale->data(); + const T* bias_data = nullptr; + if (bias) bias_data = bias->data(); int imsize = x_dims[2] * x_dims[3]; - int block_size = std::min(512, imsize); + int block_size = std::min(1024, imsize); dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); - GroupNormBackwardGetMeanAndVar<<>>( - x_data, mean_data, var_data, scale_data, y_data, x_dims[0], x_dims[1], - imsize, groups, group_size, epsilon, d_x_data, temp_mean_data, - temp_var_data, d_scale_data, d_bias_data); - GroupNormBackward<<>>( - x_data, mean_data, var_data, temp_mean_data, temp_var_data, x_dims[0], - x_dims[1], imsize, groups, group_size, epsilon, d_x_data); + int flags = + (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, + bias_data, y_data, x_dims[0], 
x_dims[1], imsize, groups, + group_size, epsilon, temp_mean_data, temp_var_data, + d_scale_data, d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, + bias_data, var_data, temp_mean_data, temp_var_data, + x_dims[0], x_dims[1], imsize, groups, group_size, + epsilon, d_x_data); + } } }; diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 3d6c6a46a96..498e65f6149 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -96,10 +96,10 @@ class GroupNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("X"); - auto* mean = ctx.Input("Mean"); + auto* x = ctx.Input("Y"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); auto* d_y = ctx.Input(framework::GradVarName("Y")); const auto groups = ctx.Attr("groups"); @@ -111,19 +111,13 @@ class GroupNormGradKernel : public framework::OpKernel { const auto& x_dims = x->dims(); const int group_size = (x_dims[1] - 1) / groups + 1; - // TODO(liangdun): need to check d_x is null + d_x->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - T* d_x_data = nullptr; - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_x, static_cast(0)); - d_x_data = d_x->data(); - } auto* x_data = x->data(); + auto* d_x_data = d_x->data(); auto* y_data = d_y->data(); - auto* mean_data = mean->data(); auto* var_data = var->data(); T* d_scale_data = nullptr; if (d_scale) { @@ -140,6 +134,8 @@ class GroupNormGradKernel : public framework::OpKernel { const T* scale_data = nullptr; if (scale) scale_data = scale->data(); + const T* bias_data = nullptr; + if (bias) bias_data = bias->data(); int imsize = x_dims[2] * x_dims[3]; auto* iter_x_data = x_data; @@ -147,46 +143,45 @@ class GroupNormGradKernel : public framework::OpKernel { auto* iter_y_data = y_data; for (int bid = 0; bid < x_dims[0]; bid++) for (int gid = 0; gid < groups; gid++) { - T x_mean = mean_data[bid * groups + gid]; T x_var = var_data[bid * groups + gid]; T var_inv = 1.0 / sqrt(x_var + epsilon); int number = std::min(group_size, static_cast(x_dims[1] - gid * group_size)); - auto* tmp = iter_x_data; - auto* tmp2 = iter_d_x_data; - T d_var_inv = 0, d_x_mean = 0; + T number_inv = 1.0 / (number * imsize); + auto* iter_x_data2 = iter_x_data; + auto* iter_y_data2 = iter_y_data; + T dp_scale = 0, dp_bias = 0; for (int cid = 0; cid < number; cid++) { for (int imid = 0; imid < imsize; - imid++, tmp++, iter_y_data++, iter_d_x_data++) { - T val = (tmp[0] - x_mean) * var_inv; + imid++, iter_x_data++, iter_y_data++) { + T val = iter_x_data[0]; + if (bias_data) val -= bias_data[gid * group_size + cid]; T dval = iter_y_data[0]; + dp_scale += val * dval; + dp_bias += dval * scale_data[gid * group_size + cid]; + + if (scale_data && scale_data[gid * group_size + cid] != 0) + val /= scale_data[gid * group_size + cid]; if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; if (d_scale_data) d_scale_data[gid * group_size + cid] += val * dval; - if (scale_data) dval = scale_data[gid * group_size + cid] * dval; - - d_var_inv += (tmp[0] - x_mean) * dval; - T d_tmp = dval * var_inv; - if (d_x_data) iter_d_x_data[0] += d_tmp; - d_x_mean -= d_tmp; } } - T d_x_var = - -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv; 
- d_x_mean -= 2 * d_x_var * x_mean; - d_x_var /= number * imsize; - d_x_mean /= number * imsize; - - iter_d_x_data = tmp2; - - if (d_x_data) { - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; - imid++, iter_x_data++, iter_d_x_data++) { - iter_d_x_data[0] += d_x_mean; - iter_d_x_data[0] += iter_x_data[0] * 2 * d_x_var; - } + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; + imid++, iter_d_x_data++, iter_x_data2++, iter_y_data2++) { + T v_y = iter_x_data2[0]; + T dly = iter_y_data2[0]; + T dss = dp_scale; + T dbs = dp_bias; + T v_scale = scale_data[gid * group_size + cid]; + T v_bias = bias_data[gid * group_size + cid]; + v_y -= v_bias; + if (v_scale != 0) v_y /= v_scale; + iter_d_x_data[0] = + (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * + var_inv; } } } diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 66d37c3bf31..240cec14dc2 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -14,7 +14,9 @@ limitations under the License. */ #include #include +#include "cub/cub.cuh" #include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { @@ -22,28 +24,11 @@ namespace operators { namespace math { template -__inline__ __device__ T warpReduceSum(T val) { -#if CUDA_VERSION < 9000 - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down(val, offset); - return val; -#else -#define FULL_MASK 0xffffffff - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down_sync(FULL_MASK, val, offset); - return val; -#endif -} -__forceinline__ __device__ unsigned lane_id() { - unsigned ret; - asm volatile("mov.u32 %0, %laneid;" : "=r"(ret)); - return ret; -} - -__forceinline__ __device__ unsigned warp_id() { - unsigned ret; - asm volatile("mov.u32 %0, %warpid;" : "=r"(ret)); - return ret; +__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { + typedef cub::WarpReduce WarpReduce; + typename WarpReduce::TempStorage temp_storage; + value = WarpReduce(temp_storage).Sum(value); + if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value); } #define ARG_DEFINE_KernelDepthwiseConv \ @@ -58,7 +43,7 @@ __forceinline__ __device__ unsigned warp_id() { // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. 
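// The CudaAtomicAddWithWarp helper added above (and its twin in
// group_norm_op.cu) first sums a value across the 32 lanes of a warp and then
// issues a single atomic per warp instead of one atomic per thread. A minimal
// stand-alone CUDA sketch of the same idea using raw warp shuffles (the patch
// itself uses cub::WarpReduce, which performs the same reduction):
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void SumWithWarpAtomics(const float* data, int n, float* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  float val = (i < n) ? data[i] : 0.f;
  // After the loop, lane 0 holds the sum of the whole warp.
  for (int offset = 16; offset > 0; offset >>= 1)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  if ((threadIdx.x & 31) == 0) atomicAdd(out, val);  // one atomic per warp
}

int main() {
  const int n = 1 << 20;
  std::vector<float> host(n, 1.0f);
  float *d_data = nullptr, *d_out = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_data, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemset(d_out, 0, sizeof(float));
  SumWithWarpAtomics<<<(n + 255) / 256, 256>>>(d_data, n, d_out);
  float result = 0.f;
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("sum = %.0f (expected %d)\n", result, n);
  cudaFree(d_data);
  cudaFree(d_out);
  return 0;
}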
-template +template __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { @@ -87,7 +72,11 @@ __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { if (h_in >= h_start && h_in < h_end && w_in >= w_start && w_in < w_end) { const int offset = in_offset + h_in * input_width + w_in; - value += weight[weight_offset] * input_data[offset]; + if (fuse_relu_before_conv) { + value += weight[weight_offset] * max(0.0f, input_data[offset]); + } else { + value += weight[weight_offset] * input_data[offset]; + } } weight_offset++; } @@ -100,7 +89,7 @@ __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { } } -template +template __device__ __inline__ void KernelDepthwiseConvCFilter( ARG_DEFINE_KernelDepthwiseConv) { const int kWeghtSize = c_filter * c_filter; @@ -137,7 +126,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { const int offset = in_offset + h_in * input_width + w_in; - value += r_weight[h_f * c_filter + w_f] * input_data[offset]; + if (fuse_relu_before_conv) { + value += r_weight[h_f * c_filter + w_f] * + max(0.0f, input_data[offset]); + } else { + value += r_weight[h_f * c_filter + w_f] * input_data[offset]; + } } } } @@ -149,18 +143,19 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( } } -template +template __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { if (c_filter_multiplier == 0) { if (c_filter == -1) - KernelDepthwiseConv( + KernelDepthwiseConv( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, output_data); else - KernelDepthwiseConvCFilter( + KernelDepthwiseConvCFilter( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, filter_multiplier, filter_height, filter_width, stride_height, @@ -168,14 +163,14 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { dilate_width, output_data); } else { if (c_filter == -1) - KernelDepthwiseConv(input_data, filter_data, batch_size, - output_channels, output_height, output_width, - input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_height, - c_stride, c_stride, padding_height, padding_width, - dilate_height, dilate_width, output_data); + KernelDepthwiseConv( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + output_data); else - KernelDepthwiseConvCFilter( + KernelDepthwiseConvCFilter( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_height, c_stride, c_stride, @@ -186,17 +181,18 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { // CUDA kernel to compute the depthwise convolution backprop w.r.t input. 
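// In the input-grad kernels that follow, relu's backward is folded in: where
// the pre-relu input is <= 0 the input gradient is written as 0 and the whole
// accumulation is skipped, otherwise the usual conv input gradient is kept
// (and the filter-grad kernel reads max(0, x) instead of a stored relu
// output). A host-side sketch of that masking rule (hypothetical values):
#include <cassert>
#include <vector>

// Chain rule through relu applied to an already-computed conv gradient
// w.r.t. relu(x): zero wherever the pre-relu input was not positive.
static std::vector<float> FoldReluBackward(const std::vector<float>& x,
                                           const std::vector<float>& grad_wrt_relu_x) {
  std::vector<float> grad_wrt_x(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    grad_wrt_x[i] = (x[i] > 0.f) ? grad_wrt_relu_x[i] : 0.f;
  return grad_wrt_x;
}

int main() {
  const std::vector<float> x = {-2.f, 0.5f, 3.f, -0.1f};
  const std::vector<float> g = {10.f, 20.f, 30.f, 40.f};  // hypothetical conv grads
  const std::vector<float> dx = FoldReluBackward(x, g);
  assert(dx[0] == 0.f && dx[1] == 20.f && dx[2] == 30.f && dx[3] == 0.f);
  return 0;
}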
#define ARG_DEFINE_KernelDepthwiseConvInputGrad \ - const T *const output_grad_data, const T *const filter_data, \ - const int batch_size, const int output_channels, \ - const int output_height, const int output_width, \ - const int input_channels, const int input_height, const int input_width, \ + const T *const input_data, const T *const output_grad_data, \ + const T *const filter_data, const int batch_size, \ + const int output_channels, const int output_height, \ + const int output_width, const int input_channels, \ + const int input_height, const int input_width, \ const int filter_multiplier, const int filter_height, \ const int filter_width, const int stride_height, const int stride_width, \ const int padding_height, const int padding_width, \ const int dilate_height, const int dilate_width, \ T *const input_grad_data -template +template __device__ __inline__ void KernelDepthwiseConvInputGrad( ARG_DEFINE_KernelDepthwiseConvInputGrad) { for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { @@ -217,6 +213,15 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( int w_out_end = w_in + padding_width; T value = 0; + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + if (fuse_relu_before_conv) { + if (input_data[index] <= 0) { + input_grad_data[index] = 0; + continue; + } + } for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; c_out++) { @@ -242,15 +247,13 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( } } } - int index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; input_grad_data[index] = value; } } } -template +template __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( ARG_DEFINE_KernelDepthwiseConvInputGrad) { const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1; @@ -276,6 +279,15 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; T value = 0; + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + if (fuse_relu_before_conv) { + if (input_data[index] <= 0) { + input_grad_data[index] = 0; + continue; + } + } for (int c_i = 0; c_i < filter_multiplier; c_i++) { int c_out = c_in * filter_multiplier + c_i; @@ -300,34 +312,33 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( } } } - int index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; input_grad_data[index] = value; } } } -template +template __global__ void KernelDepthwiseConvInputGradSp( ARG_DEFINE_KernelDepthwiseConvInputGrad) { if (c_filter_multiplier == 0) - KernelDepthwiseConvInputGrad( - output_grad_data, filter_data, batch_size, output_channels, + KernelDepthwiseConvInputGrad( + input_data, output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); else if (c_filter == -1) - KernelDepthwiseConvInputGrad( - output_grad_data, filter_data, batch_size, output_channels, + KernelDepthwiseConvInputGrad( + input_data, output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, padding_height, padding_width, dilate_height, dilate_width, 
input_grad_data); else - KernelDepthwiseConvInputGradCFilter( - output_grad_data, filter_data, batch_size, output_channels, + KernelDepthwiseConvInputGradCFilter( + input_data, output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, padding_height, padding_width, dilate_height, dilate_width, @@ -335,7 +346,7 @@ __global__ void KernelDepthwiseConvInputGradSp( } // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. -template +template __device__ __inline__ void KernelDepthwiseConvFilterGrad( const T* output_grad_data, const T* input_data, const int num, const int output_channels, const int output_height, const int output_width, @@ -347,7 +358,6 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad( T s = 0; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; - int lid = lane_id(); for (int image_w = threadIdx.x; image_w < output_width; image_w += blockDim.x) { @@ -364,28 +374,28 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad( if (image_wk < 0 || image_wk >= input_width) continue; #define gaid(N, C, H, W) \ ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) - - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - input_data[((bid * (gridDim.z / filter_multiplier) + - kernel_id / filter_multiplier) * - input_height + - image_hk) * - input_width + - image_wk]; + int input_id = ((bid * (gridDim.z / filter_multiplier) + + kernel_id / filter_multiplier) * + input_height + + image_hk) * + input_width + + image_wk; + if (fuse_relu_before_conv) { + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + max(0.0f, input_data[input_id]); + } else { + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + input_data[input_id]; + } #undef gaid } } } -#if __CUDA_ARCH__ >= 530 - s = warpReduceSum(s); - if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); -#else - paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); -#endif + CudaAtomicAddWithWarp(&filter_grad_data[gbid], s); } -template +template __global__ void KernelDepthwiseConvFilterGradSp( const T* output_grad_data, const T* input_data, const int num, const int output_channels, const int output_height, const int output_width, @@ -395,14 +405,14 @@ __global__ void KernelDepthwiseConvFilterGradSp( const int padding_height, const int padding_width, const int dilate_height, const int dilate_width, T* filter_grad_data) { if (c_filter_multiplier == 0) - KernelDepthwiseConvFilterGrad( + KernelDepthwiseConvFilterGrad( output_grad_data, input_data, num, output_channels, output_height, output_width, input_channels, input_height, input_width, filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, filter_grad_data); else - KernelDepthwiseConvFilterGrad( + KernelDepthwiseConvFilterGrad( output_grad_data, input_data, num, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, stride_height, @@ -415,8 +425,9 @@ __global__ void KernelDepthwiseConvFilterGradSp( * Ksize, strides, paddings are two elements. These two elements represent * height and width, respectively. 
*/ -template -class DepthwiseConvFunctor { +template +class DepthwiseConvFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, @@ -446,6 +457,10 @@ class DepthwiseConvFunctor { T* output_data = output->mutable_data(context.GetPlace()); int thread = 512; + if (output_width > 1024 && output_width <= 2048) + thread = (output_width - 1) / 2 + 1; + else if (output_width > 512 && output_width <= 1024) + thread = output_width; int blocks = std::min(std::max(thread / output_width, 1), output_height); dim3 threads(std::min(output_width, thread), blocks, 1); dim3 grid(output_channels, batch_size, 1); @@ -456,8 +471,9 @@ class DepthwiseConvFunctor { stride_height == stride_width && stride_height == c_stride && \ (ksize_height == ksize_width && ksize_height == c_filter || \ c_filter == -1)) { \ - KernelDepthwiseConvSp<<>>( \ + KernelDepthwiseConvSp< \ + T, c_filter_multiplier, c_stride, c_filter, \ + fuse_relu_before_conv><<>>( \ input_data, filter_data, batch_size, output_channels, output_height, \ output_width, input_channels, input_height, input_width, \ filter_multiplier, ksize_height, ksize_width, stride_height, \ @@ -480,8 +496,9 @@ class DepthwiseConvFunctor { } }; -template -class DepthwiseConvInputGradFunctor { +template +class DepthwiseConvInputGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, @@ -507,11 +524,16 @@ class DepthwiseConvInputGradFunctor { const int dilate_height = dilations[0]; const int dilate_width = dilations[1]; + const T* input_data = input.data(); const T* filter_data = filter.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int thread = 512; + if (input_width > 1024 && input_width <= 2048) + thread = (input_width - 1) / 2 + 1; + else if (input_width > 512 && input_width <= 1024) + thread = input_width; int blocks = std::min(std::max(thread / input_width, 1), input_height); dim3 threads(std::min(input_width, thread), blocks, 1); dim3 grid(input_channels, batch_size, 1); @@ -524,13 +546,13 @@ class DepthwiseConvInputGradFunctor { (ksize_height == ksize_width && ksize_height == c_filter || \ c_filter == -1)) { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, c_stride, \ - c_filter><<>>( \ - output_grad_data, filter_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, input_grad_data); \ + T, c_filter_multiplier, c_stride, c_filter, \ + fuse_relu_before_conv><<>>( \ + input_data, output_grad_data, filter_data, batch_size, \ + output_channels, output_height, output_width, input_channels, \ + input_height, input_width, filter_multiplier, ksize_height, \ + ksize_width, stride_height, stride_width, padding_height, \ + padding_width, dilate_height, dilate_width, input_grad_data); \ return; \ } check_case(1, 1, 3); @@ -552,8 +574,9 @@ class DepthwiseConvInputGradFunctor { } }; -template -class DepthwiseConvFilterGradFunctor { +template +class DepthwiseConvFilterGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, @@ -583,6 +606,10 @@ class DepthwiseConvFilterGradFunctor { T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); int block_size = 512; + if (output_width > 1024 && output_width <= 2048) + 
block_size = (output_width - 1) / 2 + 1; + else if (output_width > 512 && output_width <= 1024) + block_size = output_width; int crop_output_height = std::min(std::max(block_size / output_width, 1), output_height); dim3 grid(ksize_width, ksize_height, output_channels); @@ -592,7 +619,8 @@ class DepthwiseConvFilterGradFunctor { #define check_case(c_filter_multiplier) \ if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ KernelDepthwiseConvFilterGradSp< \ - T, c_filter_multiplier><<>>( \ + T, c_filter_multiplier, \ + fuse_relu_before_conv><<>>( \ output_grad_data, input_data, batch_size, output_channels, \ output_height, output_width, input_channels, input_height, \ input_width, filter_multiplier, ksize_height, ksize_width, \ @@ -606,18 +634,31 @@ class DepthwiseConvFilterGradFunctor { } }; -template class DepthwiseConvFunctor; -template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvInputGradFunctor; template class DepthwiseConvInputGradFunctor; + double, false>; + +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; + +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; + +template class DepthwiseConvInputGradFunctor; template class DepthwiseConvInputGradFunctor; + double, true>; template class DepthwiseConvFilterGradFunctor; + float, true>; template class DepthwiseConvFilterGradFunctor; + double, true>; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h index 71f6fcb23df..56648e4125b 100644 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -26,7 +26,8 @@ namespace math { * \brief Compute the depthwise convolution which include * forward process and backpropagation process */ -template +template class DepthwiseConvFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, @@ -36,7 +37,8 @@ class DepthwiseConvFunctor { const std::vector& dilations, framework::Tensor* output); }; -template +template class DepthwiseConvInputGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, @@ -48,7 +50,8 @@ class DepthwiseConvInputGradFunctor { framework::Tensor* input_grad); }; -template +template class DepthwiseConvFilterGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 96fa428ee36..96d0d16bf78 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1023,6 +1023,20 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") + .def_property( + "fuse_relu_depthwise_conv", + [](const BuildStrategy &self) { + return self.fuse_relu_depthwise_conv_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + self.fuse_relu_depthwise_conv_ = b; + }, + R"DOC(The type is BOOL, fuse_relu_depthwise_conv indicate whether + to fuse relu and depthwise_conv2d, + it will save GPU memory and may make the execution faster. + This options is only available in GPU devices. 
+ Default False)DOC") .def_property( "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py index baa14a573fc..1f7ec69dd75 100644 --- a/python/paddle/fluid/contrib/memory_usage_calc.py +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -76,7 +76,7 @@ def memory_usage(program, batch_size): # Get the var_name list of first block and calculate total_memory = 0.0 - processed_var_names = set() + processed_var_names = set(["@EMPTY@"]) for op in program.global_block().ops: for var_name in op.output_arg_names: if var_name in processed_var_names: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ea88d8b4d09..503c91c27ba 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1972,6 +1972,7 @@ def conv2d(input, 'groups': groups, 'use_cudnn': use_cudnn, 'use_mkldnn': False, + 'fuse_relu_before_depthwise_conv': False }) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 1ba47d5a576..fdacd241f9e 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -42,6 +42,7 @@ class TestParallelExecutorBase(unittest.TestCase): use_reduce=False, use_ir_memory_optimize=False, fuse_elewise_add_act_ops=False, + fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, enable_sequential_execution=False): @@ -60,7 +61,8 @@ class TestParallelExecutorBase(unittest.TestCase): loss = method(use_feed=feed_dict is not None) - optimizer().minimize(loss) + if optimizer: + optimizer().minimize(loss) if memory_opt: fluid.memory_optimize(main) @@ -76,6 +78,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.memory_optimize = use_ir_memory_optimize build_strategy.enable_sequential_execution = enable_sequential_execution if use_cuda and core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 25a9e8d46ed..2927a9828fd 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -70,6 +70,7 @@ class TestConv2dOp(OpTest): self.exhaustive_search = False self.use_cuda = False self.use_mkldnn = False + self.fuse_relu_before_depthwise_conv = False self.data_format = "AnyLayout" self.dtype = np.float32 self.init_kernel_type() @@ -84,8 +85,17 @@ class TestConv2dOp(OpTest): } input = np.random.random(self.input_size).astype(self.dtype) + if not self.testcuda(): + self.fuse_relu_before_depthwise_conv = False + if self.fuse_relu_before_depthwise_conv: + input = input - 0.5 + input -= (input < 0) * 0.1 + input += (input >= 0) * 0.1 + input2 = np.maximum(input, 0.0) + else: + input2 = input filter = np.random.random(self.filter_size).astype(self.dtype) - output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups, + output, _, _, _, _ = conv2d_forward_naive(input2, 
                                                    conv2d_param)
         output = output.astype(self.dtype)
 
@@ -101,6 +111,8 @@ class TestConv2dOp(OpTest):
             'use_cudnn': self.use_cudnn,
             'use_mkldnn': self.use_mkldnn,
             'data_format': self.data_format,
+            'fuse_relu_before_depthwise_conv':
+            self.fuse_relu_before_depthwise_conv,
             'exhaustive_search': self.exhaustive_search
         }
         self.outputs = {'Output': output}
@@ -364,6 +376,78 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
         self.op_type = "depthwise_conv2d"
 
 
+class TestDepthwiseConvandFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [3, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConv2andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [3, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConv3andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConvWithDilationandFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConvWithDilation2andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
 class TestCUDNNExhaustiveSearch(TestConv2dOp):
     def init_kernel_type(self):
         self.use_cudnn = True
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
new file mode 100644
index 00000000000..0c8531606b8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
+
+def norm(*args, **kargs):
+    return fluid.layers.batch_norm(*args, **kargs)
+
+
+def sep_conv(input, channel, stride, filter, dilation=1, act=None):
+    # with scope('depthwise'):
+    input = fluid.layers.conv2d(
+        input,
+        input.shape[1],
+        filter,
+        stride,
+        groups=input.shape[1],
+        padding=(filter // 2) * dilation,
+        dilation=dilation,
+        use_cudnn=False,
+        bias_attr=False)
+    input = norm(input)
+    if act: input = act(input)
+    # with scope('pointwise'):
+    input = fluid.layers.conv2d(
+        input, channel, 1, 1, groups=1, padding=0, bias_attr=False)
+    input = norm(input)
+    if act: input = act(input)
+    return input
+
+
+def simple_depthwise_net(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'])
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    hidden = fluid.layers.reshape(img, (-1, 1, 28, 28))
+    for _ in range(4):
+        hidden = sep_conv(hidden, channel=200, stride=2, filter=5)
+        hidden = fluid.layers.relu(hidden)
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
+    def _init_data(self, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(size=[32, 784]).astype(np.float32)
+        else:
+            img = np.ones(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    def _compare(self, model, use_cuda, random_data=True, only_forward=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        img, label = self._init_data(random_data)
+
+        def _optimizer(learning_rate=1e-6):
+            optimizer = fluid.optimizer.SGD(
+                learning_rate=learning_rate,
+                regularization=fluid.regularizer.L2Decay(1e-6))
+            return optimizer
+
+        if only_forward:
+            _optimizer = None
+
+        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_relu_depthwise_conv=True,
+            use_ir_memory_optimize=True,
+            memory_opt=False,
+            optimizer=_optimizer)
+        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_relu_depthwise_conv=False,
+            memory_opt=False,
+            optimizer=_optimizer)
+
+        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+
+    def test_simple_depthwise_with_fuse_op(self):
+        self._compare(simple_depthwise_net, True)
+        self._compare(simple_depthwise_net, False)
+
+    def test_simple_depthwise_with_fuse_op_only_forward(self):
+        self._compare(simple_depthwise_net, True, only_forward=True)
+        self._compare(simple_depthwise_net, False, only_forward=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab
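
For readers who want to try the new build-strategy switch from Python, the sketch below shows one possible way to enable it. It is illustrative only and not part of the patch: the tiny relu + depthwise conv2d program and the exact ParallelExecutor arguments are assumptions based on the Fluid 1.x API current at the time of this change.

    # Minimal sketch, assuming the Fluid 1.x Python API of this era.
    # The pass only matches relu followed by a depthwise conv2d
    # (groups == number of input channels, use_cudnn=False) and only
    # takes effect when running on GPU.
    import paddle.fluid as fluid

    img = fluid.layers.data(name='image', shape=[3, 28, 28], dtype='float32')
    x = fluid.layers.relu(img)
    x = fluid.layers.conv2d(
        x, num_filters=3, filter_size=3, padding=1, groups=3,
        use_cudnn=False, bias_attr=False)  # depthwise conv
    loss = fluid.layers.mean(x)
    fluid.optimizer.SGD(learning_rate=1e-3).minimize(loss)

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_relu_depthwise_conv = True  # flag added by this patch

    pe = fluid.ParallelExecutor(
        use_cuda=True, loss_name=loss.name, build_strategy=build_strategy)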
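The reference computation behind the new TestDepthwiseConv*andFuse cases can be summarised as: apply relu to the input first, then run an ordinary depthwise convolution, which is what the fused kernel computes without keeping a separate relu output alive for the backward pass. A small NumPy sketch of that semantics follows; it is illustrative only, the helper name and the channel_multiplier=1 restriction are assumptions, and the real tests go through conv2d_forward_naive instead.

    import numpy as np


    def relu_depthwise_conv_reference(x, w, stride=1, pad=1):
        """Naive NCHW depthwise conv (channel multiplier 1) with relu fused in front.

        x: input of shape [N, C, H, W]; w: filter of shape [C, 1, KH, KW].
        """
        x = np.maximum(x, 0.0)  # the "fused" relu applied before the conv
        n, c, h, wd = x.shape
        kh, kw = w.shape[2], w.shape[3]
        xp = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')
        oh = (h + 2 * pad - kh) // stride + 1
        ow = (wd + 2 * pad - kw) // stride + 1
        out = np.zeros((n, c, oh, ow), dtype=x.dtype)
        for i in range(oh):
            for j in range(ow):
                patch = xp[:, :, i * stride:i * stride + kh,
                           j * stride:j * stride + kw]
                # each output channel only sees its own input channel
                out[:, :, i, j] = np.sum(patch * w[:, 0, :, :], axis=(2, 3))
        return out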