add cross vec to speedup gru

b0d36c4c · tensor-tang · 038c16ee · b0d36c4c · b0d36c4c
隐藏空白更改
内联并排

Showing with 65 addition and 12 deletion

paddle/fluid/operators/fusion_gru_op.cc paddle/fluid/operators/fusion_gru_op.cc +6 -12

paddle/fluid/operators/math/cpu_vec.h paddle/fluid/operators/math/cpu_vec.h +59 -0

未找到文件。
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -172,19 +172,19 @@ class FusionGRUKernel : public framework::OpKernel<T> {

    bool is_reverse = ctx.Attr<bool>("is_reverse");
    std::function<void(const int, const T *, T *)> act_gate, act_state;
-    std::function<void(const int, const T, const T*, T*)> bias_sub;
+    std::function<void(const int, const T*, const T*, const T*, T*)> cross;
    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
    auto& act_state_str = ctx.Attr<std::string>("activation");
    if (platform::jit::MayIUse(platform::jit::avx)) {
      math::VecActivations<T, platform::jit::avx> act_functor;
      act_gate = act_functor(act_gate_str);
      act_state = act_functor(act_state_str);
-      bias_sub = math::vec_bias_sub<T, platform::jit::avx>;
+      cross = math::vec_cross<T, platform::jit::avx>;
    } else {
      math::VecActivations<T, platform::jit::isa_any> act_functor;
      act_gate = act_functor(act_gate_str);
      act_state = act_functor(act_state_str);
-      bias_sub = math::vec_bias_sub<T, platform::jit::isa_any>;
+      cross = math::vec_cross<T, platform::jit::isa_any>;
    }

    const T* x_data = x->data<T>();
@@ -288,15 +288,9 @@ class FusionGRUKernel : public framework::OpKernel<T> {
      for (int i = 0; i < cur_bs; ++i) {
        // ht~ = act_state(...)
        act_state(D, cur_batched_data + D2, cur_batched_data + D2);
-        // ht~~ = zt*ht~ inplace result
-        blas.VMUL(D, cur_batched_data, cur_batched_data + D2,
-                  cur_batched_data + D2);
-        // zt = 1 - zt inplace result
-        bias_sub(D, static_cast<T>(1), cur_batched_data, cur_batched_data);
-        // zt = ht_1 * zt
-        blas.VMUL(D, cur_prev_hidden_data, cur_batched_data, cur_batched_data);
-        // out = zt + ht~~
-        blas.VADD(D, cur_batched_data, cur_batched_data + D2, cur_out_data);
+        // out = zt*ht~ + (1-zt)*ht_1
+        cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data,
+              cur_out_data);

        cur_batched_data += D3;
        cur_prev_hidden_data += D;

--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -188,6 +188,65 @@ inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
  vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
 }

+// out = x*y + (1-x)*z
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
+  for (int i = 0; i < n; ++i) {
+    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
+  }
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
+                                                 const float* y, const float* z,
+                                                 float* out) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(1.f);
+  __m256 tmpx, tmpy, tmpz;
+  for (i = 0; i < end; i += block) {
+    tmpx = _mm256_loadu_ps(x + i);
+    tmpy = _mm256_loadu_ps(y + i);
+    tmpz = _mm256_loadu_ps(z + i);
+    tmpy = _mm256_mul_ps(tmpx, tmpy);
+    tmpx = _mm256_sub_ps(bias, tmpx);
+    tmpz = _mm256_mul_ps(tmpx, tmpz);
+    tmpz = _mm256_add_ps(tmpy, tmpz);
+    _mm256_storeu_ps(out + i, tmpz);
+  }
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
+  }
+#else
+  vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+#endif
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
+                                                  const float* y,
+                                                  const float* z, float* out) {
+  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx512_common>(
+    const int n, const float* x, const float* y, const float* z, float* out) {
+  // TODO(TJ): enable me
+  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {