Refine JIT vmul to support all sizes

9255119f · tensor-tang · a9c18241 · 9255119f · 9255119f
隐藏空白更改
内联 | 并排

Showing 2 changed files with 10 additions and 12 deletions

paddle/fluid/operators/math/jit_code.cc paddle/fluid/operators/math/jit_code.cc +10 -11

paddle/fluid/operators/math/jit_code.h paddle/fluid/operators/math/jit_code.h +0 -1

未找到文件。
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -27,11 +27,7 @@ using namespace platform::jit;  // NOLINT
 bool VMulJitCode::init(int d) {
  // It's not necessary to use avx512 since it would slow down the frequency
  // and this kernel is not compute bound.
-  if (MayIUse(avx)) {
+  return MayIUse(avx);
-    return d % 2 == 0;
-  } else {
-    return false;
-  }
 }
 void VMulJitCode::generate() {
@@ -54,16 +50,19 @@ void VMulJitCode::generate() {
    rest -= 4;
  }
  if (rest >= 2) {
-    mov(tmp, qword[param1 + offset]);
+    vmovq(xmm_src1, ptr[param1 + offset]);
-    vmovq(xmm_src1, tmp);
+    vmovq(xmm_src2, ptr[param2 + offset]);
-    mov(tmp, qword[param2 + offset]);
-    vmovq(xmm_src2, tmp);
    vmulps(xmm_dst, xmm_src1, xmm_src2);
-    vmovq(tmp, xmm_dst);
+    vmovq(ptr[param3 + offset], xmm_dst);
-    mov(ptr[param3 + offset], tmp);
    offset += sizeof(float) * 2;
    rest -= 2;
  }
+  if (rest > 0) {
+    vmovss(xmm_src1, ptr[param1 + offset]);
+    vmovss(xmm_src2, ptr[param2 + offset]);
+    vmulss(xmm_dst, xmm_src1, xmm_src2);
+    vmovss(ptr[param3 + offset], xmm_dst);
+  }
  ret();
 }

--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -43,7 +43,6 @@ class VMulJitCode : public JitCode {
  reg64_t param1{abi_param1};
  reg64_t param2{abi_param2};
  reg64_t param3{abi_param3};
-  reg64_t tmp = rax;
  xmm_t xmm_src1 = xmm_t(0);
  xmm_t xmm_src2 = xmm_t(1);