Commit cf5ea925 authored by tensor-tang

fix bugs

Parent 6ed20474
@@ -15,12 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
+// #include "paddle/fluid/operators/math/detail/activation_functions.h"
+// #include "paddle/fluid/operators/math/cpu_vec.h"
 
 namespace paddle {
 namespace operators {
@@ -233,6 +230,13 @@ use lstm_x_t as input and compute as standard LSTM.
 )DOC");
 }
 
+template <typename T>
+inline void vec_relu(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] > 0 ? x[i] : 0;
+  }
+}
+
 // y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0;
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
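Note: the new vec_relu helper is a plain elementwise ReLU, and bias_relu adds an optional scalar bias before delegating to it. Below is a minimal standalone usage sketch; the if (bias) guard is inferred from the else branch visible in the next hunk, and the sample values are made up.

#include <cstdio>

// Compact copies of the two helpers from this commit, for illustration only.
template <typename T>
inline void vec_relu(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) y[i] = x[i] > 0 ? x[i] : 0;
}

template <typename T>
inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
  if (bias) {  // assumed guard; only the else branch is shown in the diff
    for (int i = 0; i < n; ++i) y[i] = x[i] + bias[0];  // add scalar bias
    vec_relu<T>(n, y, y);                               // then clamp in place
  } else {
    vec_relu<T>(n, x, y);
  }
}

int main() {
  const float x[4] = {-1.5f, -0.2f, 0.3f, 2.0f};
  const float bias[1] = {0.5f};
  float y[4];
  bias_relu<float>(4, x, bias, y);  // expected: 0, 0.3, 0.8, 2.5
  for (float v : y) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}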
@@ -240,14 +244,14 @@ inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
     for (int i = 0; i < n; ++i) {
       y[i] = x[i] + bias[0];
     }
-    vec_relu(n, y, y);
+    vec_relu<T>(n, y, y);
   } else {
-    vec_relu(n, x, y);
+    vec_relu<T>(n, x, y);
   }
 }
 
 template <typename DeviceContext, typename T>
-inline void vec_softmax(const BlasT<DeviceContext, T>& blas, const int n,
+inline void vec_softmax(const math::BlasT<DeviceContext, T>& blas, const int n,
                         const T* x, T* y) {
   T scalar = x[0];
   // max
@@ -257,7 +261,7 @@ inline void vec_softmax(const BlasT<DeviceContext, T>& blas, const int n,
   // sub
   for (int i = 0; i < n; ++i) {
-    y[c] = x[c] - alpha;
+    y[i] = x[i] - scalar;
   }
   // exp
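Note: the steps marked in vec_softmax (max, sub, exp, sum, scale) are the usual numerically stable softmax; subtracting the running maximum keeps exp() from overflowing. A plain-loop sketch of the same effect, without the BLAS calls used in the op (the max/sum steps themselves are elided in this diff, so this is a reference reading of their intent):

#include <algorithm>
#include <cmath>

// Reference softmax over n > 0 elements: y[i] = exp(x[i] - max(x)) / sum.
template <typename T>
void softmax_ref(const int n, const T* x, T* y) {
  T scalar = *std::max_element(x, x + n);             // max
  for (int i = 0; i < n; ++i) y[i] = x[i] - scalar;   // sub
  for (int i = 0; i < n; ++i) y[i] = std::exp(y[i]);  // exp
  scalar = T(0);
  for (int i = 0; i < n; ++i) scalar += y[i];         // sum
  for (int i = 0; i < n; ++i) y[i] /= scalar;         // scale
}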
@@ -270,57 +274,45 @@ inline void vec_softmax(const BlasT<DeviceContext, T>& blas, const int n,
   }
   // scale
-  blas.VSCAL(n, static_cast<T>(1) / scalar, y);
+  blas.SCAL(n, static_cast<T>(1) / scalar, y);
 }
 
-__m256 exp(__m256 a) { return exp256_ps(a); }
-
-__m256 log(__m256 a) { return log256_ps(a); }
-
-__m256 sin(__m256 a) { return sin256_ps(a); }
-
-__m256 cos(__m256 a) { return cos256_ps(a); }
-
-__m256 relu(const __m256 a) {
-  __m256 tmp = _mm256_set1_ps(0.0f);
-  return _mm256_max_ps(a, tmp);
-}
-
-__m256 sigmoid(const __m256 a) {
-  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
-  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
-  __m256 tmp = _mm256_max_ps(a, min);
-  tmp = _mm256_min_ps(tmp, max);
-  tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
-  tmp = exp(tmp);
-  tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
-  tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
-  return tmp;
-}
-
-__m256 tanh(const __m256 a) {
-  __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
-  __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
-  tmp = _mm256_min_ps(tmp, max);
-  tmp = exp(tmp);
-  return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
-                                     _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
-                       _mm256_set1_ps(1.0f));
-}
-
-__m256 linear(const __m256 a) { return a; }
-
-inline void vec_sigmoid(const T* x, T* y) {
-  const real min = SIGMOID_THRESHOLD_MIN;
-  const real max = SIGMOID_THRESHOLD_MAX;
-  real tmp = (a < min) ? min : ((a > max) ? max : a);
-  return 1.0 / (1.0 + exp(-tmp));
-}
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+
+template <typename T>
+inline T sigmoid(T x) {
+  return 1. / (1. + exp(-x));
+}
+
+template <typename T>
+inline T tanh(T x) {
+  return 2. * sigmoid(2. * x) - 1.;
+}
+
+template <typename T>
+inline void vec_sigmoid(const int n, const T* x, T* y) {
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  for (int i = 0; i < n; ++i) {
+    T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
+    y[i] = 1.0 / (1.0 + std::exp(-tmp));
+  }
+}
+
+template <typename T>
+inline void vec_tanh(const int n, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = tanh<T>(x[i]);
+  }
+}
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class AttentionLSTMKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+
     auto* x = ctx.Input<LoDTensor>("X");  // T x M
     auto* h0 = ctx.Input<Tensor>("H0");   // N x D
     auto* c0 = ctx.Input<Tensor>("C0");   // N x D
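Note: the scalar tanh added above is written through sigmoid via the identity tanh(x) = 2*sigmoid(2x) - 1, and vec_sigmoid clips its input to [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX] before exponentiating. A small self-contained spot-check of the identity against std::tanh, for illustration only:

#include <cassert>
#include <cmath>
#include <cstdio>

static double sigmoid_ref(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
  // tanh(x) == 2 * sigmoid(2x) - 1 for every x; check a few points.
  const double xs[] = {-3.0, -0.5, 0.0, 0.7, 2.5};
  for (double x : xs) {
    double via_sigmoid = 2.0 * sigmoid_ref(2.0 * x) - 1.0;
    assert(std::fabs(via_sigmoid - std::tanh(x)) < 1e-12);
    std::printf("x=%.2f tanh=%.6f\n", x, via_sigmoid);
  }
  return 0;
}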
@@ -334,7 +326,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     auto* hidden_out = ctx.Output<LoDTensor>("Hidden");   // TxD
     auto* cell_out = ctx.Output<LoDTensor>("Cell");       // TxD
     auto* atted_x = ctx.Output<Tensor>("AttentionedX");   // T x 1
-    auto* fc_out = ctx.Output<Tensor>('AttentionFCOut');  // max_seq_len x 1
+    auto* fc_out = ctx.Output<Tensor>("AttentionFCOut");  // max_seq_len x 1
     auto* lstm_x = ctx.Output<Tensor>("LSTMX");           // 1 x M
     auto* lstm_out = ctx.Output<Tensor>("LSTMOUT");       // 1 x 4D
@@ -342,9 +334,10 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     auto x_lod = x->lod();
     const int N = x_lod[0].size() - 1;  // batch size
     auto x_dims = x->dims();            // T x M
-    auto w_dims = w->dims();            // (D+M) x 4D
-    const int M = x_dims[1];            // x frame size
-    const int D = w_dims[1] / 4;        // gate frame size
+    auto w_dims = lstm_w->dims();       // (D+M) x 4D
+    const int total_T = x_dims[0];
+    const int M = x_dims[1];      // x frame size
+    const int D = w_dims[1] / 4;  // gate frame size
     const int D2 = D * 2;
     const int D3 = D * 3;
     const int D4 = w_dims[1];
@@ -357,6 +350,8 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
     fc_out->Resize({max_seq_len, 1});
 
+    // TODO(TJ): act functor init here
+
     const T* x_data = x->data<T>();
     const T* h0_data = h0->data<T>();
     const T* c0_data = c0->data<T>();
@@ -368,16 +363,16 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     const T* atten_scalar_bias_data =
         atten_scalar_bias ? atten_scalar_bias->data<T>() : NULL;
 
-    T* hidden_out_data = hidden_out->mutable_data<T>();
-    T* cell_out_data = cell_out->mutable_data<T>();
-    T* atted_x_data = atted_x->mutable_data<T>();
-    T* fc_out_data = fc_out->mutable_data<T>();
-    T* lstm_x_data = lstm_x->mutable_data<T>();
-    T* lstm_out_data = lstm_out->mutable_data<T>();
+    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
+    T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
+    T* atted_x_data = atted_x->mutable_data<T>(ctx.GetPlace());
+    T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
+    T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
+    T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());
 
     // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
-    math::FCCompute<DeviceContext, T>(blas, T, 1, M, x_data, atten_w_data,
+    math::FCCompute<DeviceContext, T>(blas, total_T, 1, M, x_data, atten_w_data,
                                       atted_x_data, atten_b_data);
     const T* cur_x_data = x_data;
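Note: the corrected FCCompute call produces atted_x (total_T x 1) from x (total_T x M) and the M x 1 part of the attention weight plus an optional bias; the old code passed the type name T where the row count total_T belongs. A plain-loop sketch of that shape contract (a reference reading only, not the BLAS-backed implementation):

// y[t] = sum_m x[t*M + m] * w[m] + b[0], for t in [0, total_T)
template <typename T>
void fc_ref(int total_T, int M, const T* x, const T* w, const T* b, T* y) {
  for (int t = 0; t < total_T; ++t) {
    T acc = b ? b[0] : T(0);
    for (int m = 0; m < M; ++m) {
      acc += x[t * M + m] * w[m];
    }
    y[t] = acc;
  }
}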
@@ -400,7 +395,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
       // fc2: scalar
       if (atten_scalar_data) {
         // x = a*x
-        blas.SCAL(seq_len, atten_scalar_data, fc_out_data);
+        blas.SCAL(seq_len, *atten_scalar_data, fc_out_data);
         bias_relu<T>(seq_len, fc_out_data, atten_scalar_bias_data,
                      fc_out_data);
       }
@@ -431,16 +426,16 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
       blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data);
 
       // b = input * tilde
-      blas.VMUL(D, lstm_out_data + D, lstm_out + D3, lstm_out_data + D);
+      blas.VMUL(D, lstm_out_data + D, lstm_out_data + D3, lstm_out_data + D);
 
       // cell_out = a + b
       blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data);
 
       // state act tanh(cell_out) * output_gate
       vec_tanh(D, cur_cell_out_data, lstm_out_data);
-      blas.VMUL(D, lstm_out_data, lstm_out + D2, cur_hidden_out_data);
+      blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data);
 
-      prev_hidden_data = hidden_out + i * gate_size;
+      prev_hidden_data = cur_hidden_out_data;
       prev_cell_data = cur_cell_out_data;
       cur_cell_out_data = cur_cell_out_data + D;
       cur_hidden_out_data = cur_hidden_out_data + D;
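Note: the VMUL/VADD sequence in this hunk is the standard LSTM cell recurrence: cell = forget * prev_cell + input * candidate, then hidden = output_gate * tanh(cell), with the gate slices taken at offsets 0, D, D2, and D3 of lstm_out_data as in the code above. A scalar sketch of the same arithmetic for one unit (the helper name and struct here are hypothetical):

#include <cmath>

struct LstmStepOut { float cell; float hidden; };

// One cell step mirroring the comments above:
//   a      = forget_gate * prev_cell
//   b      = input_gate  * candidate
//   cell   = a + b
//   hidden = output_gate * tanh(cell)
LstmStepOut lstm_step_ref(float input_gate, float forget_gate,
                          float output_gate, float candidate,
                          float prev_cell) {
  LstmStepOut o;
  o.cell = forget_gate * prev_cell + input_gate * candidate;
  o.hidden = output_gate * std::tanh(o.cell);
  return o;
}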
@@ -458,7 +453,5 @@ REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp,
                   ops::AttentionLSTMOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 
-REGISTER_OP_CPU_KERNEL(
-    attention_lstm,
-    ops::AttentionLSTMKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AttentionLSTMKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel<float>,
+                       ops::AttentionLSTMKernel<double>);
@@ -160,7 +160,7 @@ class Blas {
   T DOT(int n, const T* x, const T* y) const;
 
   template <typename T>
-  void SCAL(int n, const T a, const T* x) const;
+  void SCAL(int n, const T a, T* x) const;
 
   template <typename T>
   void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
@@ -233,11 +233,26 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template VCOPY<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VEXP(ARGS... args) const {
+    Base()->template VEXP<T>(args...);
+  }
+
   template <typename... ARGS>
   void GEMV(ARGS... args) const {
     Base()->template GEMV<T>(args...);
   }
 
+  template <typename... ARGS>
+  T DOT(ARGS... args) const {
+    return Base()->template DOT<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void SCAL(ARGS... args) const {
+    Base()->template SCAL<T>(args...);
+  }
+
   template <typename... ARGS>
   void BatchedGEMM(ARGS... args) const {
     Base()->template BatchedGEMM<T>(args...);
...
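Note: the new VEXP, DOT, and SCAL members extend the typed BlasT facade that vec_softmax relies on; each one pins the element type T and forwards the remaining arguments to the untyped Blas implementation. A stripped-down, self-contained illustration of that forwarding pattern (simplified names, not the real Paddle class hierarchy):

#include <cstdio>

struct BlasImpl {
  template <typename T>
  void SCAL(int n, T a, T* x) const {
    for (int i = 0; i < n; ++i) x[i] *= a;  // scale in place
  }
};

template <typename T>
struct TypedBlas : private BlasImpl {
  template <typename... ARGS>
  void SCAL(ARGS... args) const {
    Base()->template SCAL<T>(args...);  // forward with T fixed
  }

 private:
  const BlasImpl* Base() const { return static_cast<const BlasImpl*>(this); }
};

int main() {
  TypedBlas<float> blas;
  float y[3] = {2.f, 4.f, 6.f};
  blas.SCAL(3, 0.5f, y);  // y becomes {1, 2, 3}
  std::printf("%g %g %g\n", y[0], y[1], y[2]);
  return 0;
}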
@@ -415,8 +415,7 @@ T Blas<platform::CPUDeviceContext>::DOT(int n, const T *x, const T *y) const {
 template <>
 template <typename T>
-void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a,
-                                            const T *x) const {
+void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a, T *x) const {
 #ifdef PADDLE_WITH_MKLML
   CBlas<T>::SCAL(n, a, x, 1);
 #else
...
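Note: the non-MKL branch of SCAL is elided above. Since SCAL scales x in place (which is why the parameter changed from const T* to T*), a fallback would have to perform the same in-place update as cblas_?scal with unit stride; the sketch below is only an assumption about that branch, not the code actually committed.

// Hypothetical non-MKL fallback: x[i] = a * x[i] for i in [0, n)
template <typename T>
void scal_fallback(int n, const T a, T* x) {
  for (int i = 0; i < n; ++i) {
    x[i] = a * x[i];
  }
}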