MKLDNN elementwise_mul: simple xbyak version for AVX512

2d73ad18 · Tomasz Patejko · Michal Gallus · 213ec37d · 2d73ad18
隐藏空白更改
内联并排

Showing with 27 additions and 3 deletions

paddle/fluid/operators/elementwise_mul_mkldnn_op.cc paddle/fluid/operators/elementwise_mul_mkldnn_op.cc +27 -3

未找到文件。
--- a/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_mkldnn_op.cc
@@ -17,11 +17,29 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
+#include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
 namespace paddle {
 namespace operators {
 using framework::DataLayout;
+struct vector_mul : public Xbyak::CodeGenerator {  // Emits, when constructed, a small routine computing Z = X * Y on 16 packed floats.
+  vector_mul() {  // Each call below appends one instruction to the generated code; the kernel fetches it with getCode().
+    // RDI is ptr X (NOTE(review): RDI/RSI/RDX match the System V AMD64 argument order; confirm Windows x64 is not a target)
+    // RSI is ptr Y
+    // RDX is ptr Z (the output buffer)
+    vmovups(zmm2, ptr[rdi]);  // unaligned load of 16 floats (512 bits) from X
+    vmovups(zmm3, ptr[rsi]);  // unaligned load of 16 floats (512 bits) from Y
+    vmulps(zmm1, zmm2, zmm3);  // zmm1 = zmm2 * zmm3, lane by lane
+    vmovups(ptr[rdx], zmm1);  // unaligned store of the 16 products to Z
+    ret();  // NOTE(review): no vzeroupper and no AVX-512 capability check here; presumably handled by the caller, verify
+  }
+};
 template <typename T>
 class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
 public:
@@ -61,6 +79,14 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
        constexpr int simd_width = 16;
        int C = c / simd_width;
+        vector_mul mul;
+        using mul_func_t = void (*)(const float*, const float*, float*);
+        mul_func_t mul_func = (mul_func_t)mul.getCode();
+        auto ptr_x = x_data;
        for (int ni = 0; ni < n; ni++) {
          for (int ci = 0; ci < C; ci++) {
            for (int hi = 0; hi < h; hi++) {
@@ -74,9 +100,7 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
                             ci * h * w * simd_width + hi * w * simd_width +
                             wi * simd_width;
-                for (int i = 0; i < simd_width; i++) {
+                mul_func(ptr_x, ptr_y, ptr_z);
-                  ptr_z[i] = ptr_x[i] * ptr_y[i];
-                }
              }
            }
          }