Merge pull request #9740 from dzhwinter/memory/activation

"polish activation"

Merge pull request #9740 from dzhwinter/memory/activation
"polish activation"
f7386917 · Yu Yang · GitHub · 8e005407 · ba5ddb7a · f7386917
7 changed files
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -163,7 +163,12 @@ function(op_library TARGET)
    # pybind USE_OP
    if (${pybind_flag} EQUAL 0)
+      # NOTE(*): activation registers its kernels through macros, so set USE_OP manually.
+      if(${TARGET} STREQUAL "activation")
+        file(APPEND ${pybind_file} "USE_OP(relu);\n")
+      else()
        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
+      endif()
    endif()
 endfunction()

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -9,7 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/platform/float16.h"

--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -10,6 +10,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <glog/logging.h>
+#include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
@@ -25,6 +28,16 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
+/* Names of the activation ops that are treated as in-place, i.e. whose
+   output may reuse the input buffer.
+   This is a header-level global because the Python layer side queries it
+   through IsInplace(); refer to layer_helper.py for the details.
+ */
+static std::unordered_set<std::string> InplaceOpSet = {
+    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",         "ceil",
+    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
+};
+/* Returns true when `op` is listed in InplaceOpSet.
+   The name is taken by const reference so that the lookups issued by the
+   gradient functors do not copy the string on every call.
+ */
+static bool IsInplace(const std::string& op) {
+  return InplaceOpSet.count(op) != 0;
+}
 template <typename DeviceContext, typename Functor>
 class ActivationKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -60,7 +73,6 @@ class ActivationGradKernel
 public:
  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
    auto* Out = context.Input<framework::Tensor>("Out");
    auto* dOut =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -68,7 +80,6 @@ class ActivationGradKernel
    dX->mutable_data<T>(context.GetPlace());
    auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto x = framework::EigenVector<T>::Flatten(*X);
    auto out = framework::EigenVector<T>::Flatten(*Out);
    auto dx = framework::EigenVector<T>::Flatten(*dX);
    auto* place =
@@ -78,7 +89,16 @@ class ActivationGradKernel
    for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
-    functor(*place, x, out, dout, dx);
+    bool inplace = functor.Inplace();
+    if (!inplace) {
+      auto* X = context.Input<framework::Tensor>("X");
+      auto x = framework::EigenVector<T>::Flatten(*X);
+      functor(*place, x, out, dout, dx);
+    } else {
+      VLOG(10) << " Inplace activation ";
+      auto x = framework::EigenVector<T>::Flatten(*dX);
+      functor(*place, x, out, dout, dx);
+    }
  }
 };
@@ -89,6 +109,14 @@ struct BaseActivationFunctor {
  using AttrPair = std::vector<std::pair<const char*, float*>>;
  AttrPair GetAttrs() { return AttrPair(); }
+  /* NOTE(*): Output can reuse X's memory when the gradient does not depend
+     on X. For example, sigmoid op's gradient does not involve x, so its
+     output can reuse the input memory. But abs op's gradient uses x, so it
+     cannot be computed in place.
+   */
+  bool Inplace() const { return false; }
 };
 // sigmoid(x) = 1 / (1 + exp(-x))
@@ -102,6 +130,7 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("sigmoid"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -156,6 +185,7 @@ struct ExpFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct ExpGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("exp"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -174,10 +204,11 @@ struct ReluFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct ReluGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("relu"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>();
+    dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
  }
 };
@@ -192,6 +223,7 @@ struct TanhFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct TanhGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("tanh"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -297,6 +329,7 @@ struct SqrtFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct SqrtGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("sqrt"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -316,10 +349,11 @@ struct CeilFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct ZeroGradFunctor : public BaseActivationFunctor<T> {
+  // Gradient functor intended to produce a zero gradient. It does not read
+  // x, and reports its in-place status under the op name "ceil".
+  bool Inplace() const { return IsInplace("ceil"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0) / x;
+    // Multiply rather than divide: out is 0 wherever the forward result is
+    // 0 (e.g. ceil of a value in (-1, 0]), and 0 / 0 would write NaN into
+    // dx. 0 * out keeps the gradient at zero for every finite out while
+    // still yielding an expression shaped like out.
+    dx.device(d) = static_cast<T>(0) * out;
  }
 };
@@ -432,6 +466,7 @@ struct ReciprocalFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("reciprocal"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -531,12 +566,14 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }
+  bool Inplace() const { return IsInplace("relu6"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
+    dx.device(d) =
-                   ((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
+        dout *
-                       .template cast<T>();
+        ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
+            .template cast<T>();
  }
 };
@@ -611,11 +648,12 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }
+  bool Inplace() const { return IsInplace("soft_relu"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    auto tmp = static_cast<T>(threshold);
-    auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
+    auto temp = ((out > -tmp) * (out < tmp)).template cast<T>().eval();
    dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
  }
 };
@@ -791,7 +829,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"slope", &slope}, {"offset", &offset}};
  }
+  // Const-qualified like the Inplace() of the sibling grad functors, so it
+  // can also be called through a const functor.
+  bool Inplace() const { return IsInplace("hard_sigmoid"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -33,6 +33,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -461,6 +462,9 @@ All parameter, weight, gradient are variables in Paddle.
        self.back().set_lod(t.lod());
      });
+  m.def("IsInplace",
+        [](std::string op) -> bool { return operators::IsInplace(op); });
  m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
  m.def("get_cuda_device_count", platform::GetCUDADeviceCount);

--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -19,6 +19,7 @@ from framework import Variable, Parameter, default_main_program, default_startup
 import unique_name
 from paddle.fluid.initializer import Constant, Xavier
 from param_attr import ParamAttr, WeightNormParamAttr
+import core
 class LayerHelper(object):
@@ -398,13 +399,16 @@ class LayerHelper(object):
            return input_var
        if isinstance(act, basestring):
            act = {'type': act}
-        tmp = self.create_tmp_variable(dtype=input_var.dtype)
        if 'use_mkldnn' in self.kwargs:
            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
        act_type = act.pop('type')
        if 'use_mkldnn' in self.kwargs:
            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
+        tmp = input_var
+        # NOTE(dzhwinter): some activations support in-place computation.
+        if not core.IsInplace(act_type):
+            tmp = self.create_tmp_variable(dtype=input_var.dtype)
        self.append_op(
            type=act_type,
            inputs={"X": [input_var]},

--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -361,10 +361,7 @@ class TestCeil(OpTest):
    def test_check_output(self):
        self.check_output()
-    def test_check_grad(self):
+    # No gradient check here, for the same reason as in TestFloor.
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
    def init_dtype(self):
        pass
@@ -396,10 +393,8 @@ class TestFloor(OpTest):
    def test_check_output(self):
        self.check_output()
-    def test_check_grad(self):
+    # The gradient of floor, ceil and round is undefined.
-        if self.dtype == np.float16:
+    # We return zero as the gradient, but numpy returns nan.
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
    def init_dtype(self):
        pass
@@ -501,11 +496,6 @@ class TestRound(OpTest):
    def test_check_output(self):
        self.check_output()
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
    def init_dtype(self):
        pass