Optimize the elementwise op using eigen (#15494)

* Optimize the elementwise op with CUDA kernels. test=develop * Support setting attrs in the op config file. test=develop * Add support for setting dtype and initializer in the config. test=develop * Save workspace. * Add initializer "zeros". test=develop * Fix compiling error. * Support using an existing file to initialize a tensor in op_tester. * Use Eigen to optimize elementwise_add/mul for the case where x and y have the same dims. test=develop

Optimize the elementwise op using eigen (#15494)
* Optimize the elementwise op with CUDA kernels. test=develop * Support setting attrs in the op config file. test=develop * Add support for setting dtype and initializer in the config. test=develop * Save workspace. * Add initializer "zeros". test=develop * Fix compiling error. * Support using an existing file to initialize a tensor in op_tester. * Use Eigen to optimize elementwise_add/mul for the case where x and y have the same dims. test=develop
dcda2023 · Yiqun Liu · GitHub · 4624d7c6 · dcda2023 · dcda2023
8 changed files
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -257,7 +257,8 @@ framework::VarDesc *OpTester::Var(const std::string &name) {
 template <typename T>
 void OpTester::SetupTensor(framework::LoDTensor *tensor,
                           const std::vector<int64_t> &shape, T lower, T upper,
-                           const std::string &initializer) {
+                           const std::string &initializer,
+                           const std::string &filename) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);
@@ -280,12 +281,20 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
    }
  } else if (initializer == "natural") {
    for (int i = 0; i < cpu_tensor.numel(); ++i) {
-      cpu_ptr[i] = lower + i;
+      cpu_ptr[i] = static_cast<T>(lower + i);
    }
  } else if (initializer == "zeros") {
    for (int i = 0; i < cpu_tensor.numel(); ++i) {
-      cpu_ptr[i] = 0;
+      cpu_ptr[i] = static_cast<T>(0);
    }
+  } else if (initializer == "file") {
+    std::ifstream is(filename);
+    for (size_t i = 0; i < cpu_tensor.numel(); ++i) {
+      T value;
+      is >> value;
+      cpu_ptr[i] = static_cast<T>(value);
+    }
+    is.close();
  } else {
    PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
  }
@@ -325,15 +334,19 @@ void OpTester::CreateVariables(framework::Scope *scope) {
    auto *tensor = var->GetMutable<framework::LoDTensor>();
    const auto &data_type = var_desc->GetDataType();
    if (data_type == framework::proto::VarType::INT32) {
-      SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer);
+      SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer,
+                       item.second.filename);
    } else if (data_type == framework::proto::VarType::INT64) {
-      SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer);
+      SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer,
+                           item.second.filename);
    } else if (data_type == framework::proto::VarType::FP32) {
      SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
-                         static_cast<float>(1.0), item.second.initializer);
+                         static_cast<float>(1.0), item.second.initializer,
+                         item.second.filename);
    } else if (data_type == framework::proto::VarType::FP64) {
      SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
-                          static_cast<double>(1.0), item.second.initializer);
+                          static_cast<double>(1.0), item.second.initializer,
+                          item.second.filename);
    } else {
      PADDLE_THROW("Unsupported dtype %d.", data_type);
    }

--- a/paddle/fluid/operators/benchmark/op_tester.h
+++ b/paddle/fluid/operators/benchmark/op_tester.h
@@ -55,7 +55,7 @@ class OpTester {
  template <typename T>
  void SetupTensor(framework::LoDTensor *input,
                   const std::vector<int64_t> &shape, T lower, T upper,
-                   const std::string &initializer);
+                   const std::string &initializer, const std::string &filename);
  void RunImpl();

--- a/paddle/fluid/operators/benchmark/op_tester_config.cc
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
@@ -56,6 +56,9 @@ OpInputConfig::OpInputConfig(std::istream& is) {
        ParseDims(is);
      } else if (sep == "lod" || sep == "lod:") {
        ParseLoD(is);
+      } else if (sep == "filename") {
+        is >> filename;
+        EraseEndSep(&filename);
      }
    }
  }
@@ -86,7 +89,7 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
  EraseEndSep(&initializer_str);
  const std::vector<std::string> supported_initializers = {"random", "natural",
-                                                           "zeros"};
+                                                           "zeros", "file"};
  if (!Has(supported_initializers, initializer_str)) {
    PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
  }

--- a/paddle/fluid/operators/benchmark/op_tester_config.h
+++ b/paddle/fluid/operators/benchmark/op_tester_config.h
@@ -35,7 +35,8 @@ struct OpInputConfig {
  std::string name;
  std::string dtype{"fp32"};  // int32/int, int64/long, fp32/float, fp64/double
-  std::string initializer{"random"};  // random, natural
+  std::string initializer{"random"};  // random, natural, zeros, file
+  std::string filename{""};
  std::vector<int64_t> dims;
  std::vector<std::vector<size_t>> lod;
 };

--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/platform/float16.h"

--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -40,25 +40,26 @@ template <typename DeviceContext, typename T>
 typename std::enable_if<
    std::is_floating_point<T>::value &&
    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add(const framework::ExecutionContext &ctx,
+elementwise_add_same_dims(const framework::ExecutionContext &ctx,
-                const framework::Tensor *x, const framework::Tensor *y,
+                          const framework::Tensor *x,
-                framework::Tensor *z) {
+                          const framework::Tensor *y, framework::Tensor *z) {
-  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
-  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
  auto blas = math::GetBlas<DeviceContext, T>(ctx);
-  blas.VADD(x->numel(), eigen_x.data(), eigen_y.data(), eigen_z.data());
+  blas.VADD(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
 }
 template <typename DeviceContext, typename T>
 typename std::enable_if<
    !std::is_floating_point<T>::value ||
    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add(const framework::ExecutionContext &ctx,
+elementwise_add_same_dims(const framework::ExecutionContext &ctx,
-                const framework::Tensor *x, const framework::Tensor *y,
+                          const framework::Tensor *x,
-                framework::Tensor *z) {
+                          const framework::Tensor *y, framework::Tensor *z) {
-  default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
+  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
+  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
+  auto &place = *ctx.template device_context<DeviceContext>().eigen_device();
+  eigen_z.device(place) = eigen_x + eigen_y;
 }
 template <typename DeviceContext, typename T>
@@ -73,7 +74,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
    auto dims_equal = x->dims() == y->dims();
    if (dims_equal) {
-      elementwise_add<DeviceContext, T>(ctx, x, y, z);
+      elementwise_add_same_dims<DeviceContext, T>(ctx, x, y, z);
    } else {
      default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
    }

--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
 #include "paddle/fluid/platform/float16.h"

--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -38,22 +38,26 @@ template <typename DeviceContext, typename T>
 typename std::enable_if<
    std::is_floating_point<T>::value &&
    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_mul(const framework::ExecutionContext& ctx,
+elementwise_mul_same_dims(const framework::ExecutionContext& ctx,
-                const framework::Tensor* x, const framework::Tensor* y,
+                          const framework::Tensor* x,
-                framework::Tensor* z) {
+                          const framework::Tensor* y, framework::Tensor* z) {
  auto blas = math::GetBlas<DeviceContext, T>(ctx);
-  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(),
+  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
-            z->mutable_data<T>(ctx.GetPlace()));
 }
 template <typename DeviceContext, typename T>
 typename std::enable_if<
    !std::is_floating_point<T>::value ||
    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_mul(const framework::ExecutionContext& ctx,
+elementwise_mul_same_dims(const framework::ExecutionContext& ctx,
-                const framework::Tensor* x, const framework::Tensor* y,
+                          const framework::Tensor* x,
-                framework::Tensor* z) {
+                          const framework::Tensor* y, framework::Tensor* z) {
-  default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
+  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
+  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
+  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+  eigen_z.device(place) = eigen_x * eigen_y;
 }
 template <typename DeviceContext, typename T>
@@ -88,7 +92,7 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
    z->mutable_data<T>(ctx.GetPlace());
    if (x.numel() == y->numel()) {
-      elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
+      elementwise_mul_same_dims<DeviceContext, T>(ctx, &x, y, z);
    } else {
      default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
    }