Commit 799b00bc authored by lijianshe02, committed by GitHub

add elementwise op function and add elementwise add/sub kernels test=develop (#2020)

Parent fb40c748
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/core/context.h"
namespace paddle {
namespace lite {
namespace fluid {
template <lite::TargetType Target>
struct ForRange {
  ForRange(const lite::Context<Target>& dev_ctx, size_t limit);

  template <typename Function>
  void operator()(Function func) const;
};

template <>
struct ForRange<lite::TargetType::kX86> {
  ForRange(const lite::X86Context& dev_ctx, size_t limit) : limit_(limit) {}

  template <typename Function>
  void operator()(Function func) const {
    for (size_t i = 0; i < limit_; ++i) {
      func(i);
    }
  }

  size_t limit_;
};
} // namespace fluid
} // namespace lite
} // namespace paddle
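A minimal usage sketch of ForRange on the X86 target (the helper name and buffer are hypothetical; assumes a live X86Context from an enclosing kernel):

// Hypothetical sketch: apply a functor once per index in [0, n).
void scale_inplace(const paddle::lite::X86Context& ctx,
                   float* data, size_t n, float k) {
  paddle::lite::fluid::ForRange<paddle::lite::TargetType::kX86> for_range(ctx, n);
  for_range([data, k](size_t i) { data[i] *= k; });
}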
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
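// NOTE: in this CPU-only port the qualifiers below expand to nothing.
// Upstream Paddle's CUDA builds define the analogous macros as
// __host__ __device__, __device__, and __host__ under __CUDACC__
// (assumption based on the upstream hostdevice.h).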
#define HOSTDEVICE
#define DEVICE
#define HOST
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <type_traits>
#include "lite/core/op_lite.h"
#include "lite/fluid/hostdevice.h"
namespace paddle {
namespace lite {
namespace fluid {
// Transform applies a unary or a binary functor to each element in a
// range defined by a pair of iterators.
//
// - The specialization for CPU calls std::transform.
// - The specialization for CUDA calls thrust::transform.
//
// NOTE: InputIter and OutputIter are defined as different types because
// InputIter points to the op's inputs while OutputIter points to the
// op's outputs.
//
// NOTE: We don't assume InputIter to be const InputType* and OutputIter
// to be OutputType*, because we might use an iterator class such as
// paddle::fluid::operators::RowwiseTransformIterator.
template <lite::TargetType Target>
struct Transform {
  // The unary version.
  template <typename InputIter, typename OutputIter, typename UnaryOperation>
  void operator()(const lite::Context<Target>& context,
                  InputIter first,
                  InputIter last,
                  OutputIter result,
                  UnaryOperation op);

  // The binary version.
  template <typename InputIter1,
            typename InputIter2,
            typename OutputIter,
            typename BinaryOperation>
  void operator()(const lite::Context<Target>& context,
                  InputIter1 first1,
                  InputIter1 last1,
                  InputIter2 first2,
                  OutputIter result,
                  BinaryOperation op);
};

template <>
struct Transform<lite::TargetType::kX86> {
  template <typename InputIter, typename OutputIter, typename UnaryOperation>
  void operator()(const lite::X86Context& context,
                  InputIter first,
                  InputIter last,
                  OutputIter result,
                  UnaryOperation op) {
    std::transform(first, last, result, op);
  }

  template <typename InputIter1,
            typename InputIter2,
            typename OutputIter,
            typename BinaryOperation>
  void operator()(const lite::X86Context& context,
                  InputIter1 first1,
                  InputIter1 last1,
                  InputIter2 first2,
                  OutputIter result,
                  BinaryOperation op) {
    std::transform(first1, last1, first2, result, op);
  }
};
} // namespace fluid
} // namespace lite
} // namespace paddle
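A hedged sketch of the binary Transform overload on X86 (buffer names illustrative; assumes a constructed X86Context):

// Hypothetical sketch: elementwise sum of two float buffers; the X86
// specialization simply forwards to std::transform.
void add_buffers(const paddle::lite::X86Context& ctx,
                 const float* x, const float* y, float* out, size_t n) {
  paddle::lite::fluid::Transform<paddle::lite::TargetType::kX86> trans;
  trans(ctx, x, x + n, y, out, [](float a, float b) { return a + b; });
}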
......@@ -33,6 +33,7 @@ add_kernel(concat_compute_x86 X86 basic SRCS concat_compute.cc DEPS ${lite_kerne
add_kernel(shape_compute_x86 X86 basic SRCS shape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_compute_x86 X86 basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} sequence_pooling)
add_kernel(softmax_compute_x86 X86 basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax)
+add_kernel(elementwise_compute_x86 X86 basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
if(NOT LITE_WITH_X86)
return()
......@@ -46,3 +47,4 @@ lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_com
lite_cc_test(test_sequence_pool_compute_x86 SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_x86)
lite_cc_test(test_shape_compute_x86 SRCS shape_compute_test.cc DEPS shape_compute_x86)
lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
+lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86)
......@@ -35,21 +35,3 @@ REGISTER_LITE_KERNEL(elementwise_add,
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
-#ifdef LITE_WITH_X86
-REGISTER_LITE_KERNEL(
-    elementwise_sub_grad,
-    kX86,
-    kFloat,
-    kNCHW,
-    paddle::lite::kernels::x86::ElementwiseSubGradCompute<float>,
-    def)
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
-    .BindInput(paddle::framework::GradVarName("Out"),
-               {LiteType::GetTensorTy(TARGET(kX86))})
-    .BindOutput(paddle::framework::GradVarName("X"),
-                {LiteType::GetTensorTy(TARGET(kX86))})
-    .BindOutput(paddle::framework::GradVarName("Y"),
-                {LiteType::GetTensorTy(TARGET(kX86))})
-    .Finalize();
-#endif
......@@ -15,11 +15,8 @@
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "lite/fluid/eigen.h"
#include "lite/kernels/x86/elementwise_op_function.h"
namespace paddle {
namespace lite {
......@@ -45,74 +42,17 @@ class ElementwiseSubCompute
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    auto& context = ctx_->As<X86Context>();
-   CHECK(context.x86_device_context());
    param.Out->template mutable_data<T>();
-   paddle::operators::ElementwiseComputeEx<SubFunctor<T>,
-                                           platform::CPUDeviceContext,
-                                           T>(*context.x86_execution_context(),
-                                              &param.X->raw_tensor(),
-                                              &param.Y->raw_tensor(),
-                                              param.axis,
-                                              SubFunctor<T>(),
-                                              &param.Out->raw_tensor());
+   paddle::lite::kernels::x86::ElementwiseComputeEx<SubFunctor<T>,
+                                                    lite::TargetType::kX86,
+                                                    T>(
+       context, param.X, param.Y, param.axis, SubFunctor<T>(), param.Out);
  }

  virtual ~ElementwiseSubCompute() = default;
};
-template <typename T>
-struct SubGradDX {
-  T operator()(T x, T y, T out, T dout) const { return dout; }
-};
-
-template <typename T>
-struct SubGradDY {
-  T operator()(T x, T y, T out, T dout) const { return -dout; }
-};
-
-#ifdef LITE_WITH_X86
-template <typename T>
-class ElementwiseSubGradCompute
-    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
- public:
-  using param_t = operators::ElementwiseGradParam;
-
-  void Run() override {
-    auto& param = *param_.get_mutable<param_t>();
-    auto& context = ctx_->As<X86Context>();
-    CHECK(context.x86_device_context());
-    param.X_grad->template mutable_data<T>();
-    // skip out, x, y
-    auto dout = param.Out_grad->raw_tensor();
-    auto dx = param.X_grad->raw_tensor();
-    framework::Tensor* dy = nullptr;
-    if (param.Y_grad) {
-      param.Y_grad->template mutable_data<T>();
-      dy = &param.Y_grad->raw_tensor();
-    }
-    auto& skip = dout;
-    paddle::operators::ElemwiseExplicitGradCompute<platform::CPUDeviceContext,
-                                                   T,
-                                                   SubGradDX<T>,
-                                                   SubGradDY<T>>(
-        *context.x86_execution_context(),
-        skip,
-        skip,
-        skip,
-        dout,
-        param.axis,
-        &dx,
-        dy,
-        SubGradDX<T>(),
-        SubGradDY<T>());
-  }
-
-  virtual ~ElementwiseSubGradCompute() = default;
-};
-#endif

template <typename T>
class ElementwiseAddCompute
    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
......@@ -121,16 +61,11 @@ class ElementwiseAddCompute
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    auto& context = ctx_->As<X86Context>();
-   CHECK(context.x86_device_context());
    param.Out->template mutable_data<T>();
-   paddle::operators::ElementwiseComputeEx<AddFunctor<T>,
-                                           platform::CPUDeviceContext,
-                                           T>(*context.x86_execution_context(),
-                                              &param.X->raw_tensor(),
-                                              &param.Y->raw_tensor(),
-                                              param.axis,
-                                              AddFunctor<T>(),
-                                              &param.Out->raw_tensor());
+   paddle::lite::kernels::x86::ElementwiseComputeEx<AddFunctor<T>,
+                                                    lite::TargetType::kX86,
+                                                    T>(
+       context, param.X, param.Y, param.axis, AddFunctor<T>(), param.Out);
  }

  virtual ~ElementwiseAddCompute() = default;
......
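For context, the AddFunctor/SubFunctor objects passed to the new lite-internal ElementwiseComputeEx are plain binary functors. A sketch consistent with the calls above (assumption; the actual definitions are not shown in this diff):

// Assumed functor shape expected by ElementwiseComputeEx; illustrative only.
template <typename T>
struct AddFunctor {
  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};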
......@@ -74,9 +74,9 @@ TEST(elementwise_add_x86, run_test) {
  elementwise_add.SetContext(std::move(ctx));
  elementwise_add.Run();

  LOG(INFO) << "output: ";
+ std::vector<float> ref_results = {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
  for (int i = 0; i < out.dims().production(); i++) {
    LOG(INFO) << out_data[i];
+   EXPECT_NEAR(out_data[i], ref_results[i], 1e-3);
  }
}
......
This diff is collapsed.
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// Because Boost 1.41.0's variadic templates have a bug on nvcc, Boost
// disables variadic template support in NVCC mode. Define
// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate the same
// function symbols. For details, see
// https://github.com/PaddlePaddle/Paddle/issues/3386

// Some platform-independent definitions.
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// There is no equivalent intrinsic in MSVC.
#define UNLIKELY(condition) (condition)
#endif
#if !defined(_WIN32)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
// There is no equivalent intrinsic in MSVC.
#define LIKELY(condition) (condition)
#endif
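A small illustrative use of the branch-prediction hints (function name hypothetical; on MSVC these compile to plain conditions per the fallbacks above):

// Hypothetical example: mark the divide-by-zero branch as cold so the
// compiler can lay out the hot path first.
int checked_div(int a, int b) {
  if (UNLIKELY(b == 0)) {
    return 0;  // rare error path
  }
  return a / b;
}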