Merge pull request #12745 from tensor-tang/refine/op/elewise_mul

Refine elementwise mul cpu forward

Merge pull request #12745 from tensor-tang/refine/op/elewise_mul
Refine elementwise mul cpu forward
d04ef276 · Tao Luo · GitHub · cbc6e6eb · a56142c1 · d04ef276
4 changed files
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once
 #include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"

 namespace paddle {
 namespace operators {
@@ -23,6 +24,37 @@ struct MulFunctor {
  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
 };

+template <typename DeviceContext, typename T>  // generic multiply path
+void default_elementwise_mul(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y, framework::Tensor* z) {
+  int axis = ctx.Attr<int>("axis");  // read the op's "axis" attribute
+  ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        MulFunctor<T>(), z);  // applies MulFunctor (a * b)
+}
+
+template <typename DeviceContext, typename T>  // BLAS-backed multiply overload
+typename std::enable_if<  // enabled only when T is floating point AND the context is CPU
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_mul(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(),  // length comes from x only; NOTE(review): VMUL takes `int n` - confirm numel() always fits
+            z->mutable_data<T>(ctx.GetPlace()));
+}
+
+template <typename DeviceContext, typename T>  // fallback multiply overload
+typename std::enable_if<  // enabled when T is not floating point OR the context is not CPU
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_mul(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);  // defer to the generic functor path
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
 public:
@@ -33,9 +65,11 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
    auto* y = ctx.Input<Tensor>("Y");
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          MulFunctor<T>(), z);
+    if (x->numel() == y->numel()) {
+      elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    } else {
+      default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    }
  }
 };


--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -134,6 +134,9 @@ class Blas {
  template <typename T>
  void VADD(int n, const T* x, const T* y, T* z) const;

+  template <typename T>
+  void VMUL(int n, const T* x, const T* y, T* z) const;
+
  template <typename T>
  void VCOPY(int n, const T* x, T* y) const;

@@ -202,6 +205,11 @@ class BlasT : private Blas<DeviceContext> {
    Base()->template VADD<T>(args...);
  }

+  template <typename... ARGS>
+  void VMUL(ARGS... args) const {
+    Base()->template VMUL<T>(args...);  // forward all arguments to the base VMUL<T>
+  }
+
  template <typename... ARGS>
  void VCOPY(ARGS... args) const {
    Base()->template VCOPY<T>(args...);

--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -82,6 +82,11 @@ struct CBlas<float> {
  static void VADD(ARGS... args) {
    platform::dynload::vsAdd(args...);
  }
+
+  template <typename... ARGS>
+  static void VMUL(ARGS... args) {
+    platform::dynload::vsMul(args...);  // forward to the dynamically loaded vsMul routine
+  }
 };

 template <>
@@ -142,6 +147,11 @@ struct CBlas<double> {
  static void VADD(ARGS... args) {
    platform::dynload::vdAdd(args...);
  }
+
+  template <typename... ARGS>
+  static void VMUL(ARGS... args) {
+    platform::dynload::vdMul(args...);  // forward to the dynamically loaded vdMul routine
+  }
 };

 #else
@@ -199,6 +209,7 @@ struct CBlas<platform::float16> {
  static void SMM_GEMM(...) {
    PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
  }
+  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
 #ifdef PADDLE_WITH_MKLML
  static void GEMM_BATCH(...) {
    PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@@ -374,6 +385,20 @@ void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
 #endif
 }

+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
+                                            T *z) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VMUL(n, x, y, z);
+#else
+  // TODO: check whether OpenBLAS offers a vector multiply; until then compute z[i] = x[i] * y[i] in a plain loop.
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+#endif
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,

--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -49,25 +49,27 @@ extern void* mklml_dso_handle;

 #define MKLML_ROUTINE_EACH(__macro) \
  __macro(cblas_sgemm);             \
-  __macro(cblas_saxpy);             \
-  __macro(cblas_scopy);             \
-  __macro(cblas_sgemv);             \
-  __macro(cblas_sgemm_batch);       \
  __macro(cblas_dgemm);             \
+  __macro(cblas_saxpy);             \
  __macro(cblas_daxpy);             \
+  __macro(cblas_scopy);             \
  __macro(cblas_dcopy);             \
+  __macro(cblas_sgemv);             \
  __macro(cblas_dgemv);             \
-  __macro(cblas_dgemm_batch);       \
-  __macro(vsAdd);                   \
-  __macro(vdAdd);                   \
  __macro(cblas_sgemm_alloc);       \
-  __macro(cblas_sgemm_pack);        \
-  __macro(cblas_sgemm_compute);     \
-  __macro(cblas_sgemm_free);        \
  __macro(cblas_dgemm_alloc);       \
+  __macro(cblas_sgemm_pack);        \
  __macro(cblas_dgemm_pack);        \
+  __macro(cblas_sgemm_compute);     \
  __macro(cblas_dgemm_compute);     \
+  __macro(cblas_sgemm_free);        \
  __macro(cblas_dgemm_free);        \
+  __macro(cblas_sgemm_batch);       \
+  __macro(cblas_dgemm_batch);       \
+  __macro(vsAdd);                   \
+  __macro(vdAdd);                   \
+  __macro(vsMul);                   \
+  __macro(vdMul);                   \
  __macro(MKL_Set_Num_Threads)

 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);