Commit f3cd2612 authored by tensor-tang

refine fc and use the fc compute in fusion_lstm

Parent 40138c4c
......@@ -15,8 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fc_op.h"
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
DECLARE_int32(paddle_num_threads);
#include "paddle/fluid/operators/math/fc_compute.h"
namespace paddle {
namespace operators {
......@@ -110,13 +109,8 @@ void FCOpMaker::Make() {
AddComment(R"DOC(
Fully Connected Operator.
The fully connected operation calculates the output based on the input, weights and bias attribute.
The fully connected operation calculates the output based on the input, weights and bias.
The size of each dimension of the parameters checked in the infer-shape.
The matrix of bias is generated by the mkldnn framework, when the bias_attr is True.
Additional parametrs are use_mkldnn and bias_attr.
The input(X) size and output(Out) size may be diffrent.
The fully connected layer only supports MKLDNN version
)DOC");
}
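In matrix form the operator computes (shapes as used by the kernel below, with the bias broadcast over rows):

Out[M x N] = X[M x K] * W[K x N] + B[1 x N]

where M = in_dims[0], N = w_dims[1], and K = w_dims[0].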
......@@ -133,26 +127,13 @@ class FCOpKernel : public framework::OpKernel<T> {
auto in_dims = input->dims();
auto w_dims = w->dims();
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
const T* input_data = input->data<T>();
const T* w_data = w->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0],
static_cast<T>(1), input_data, w_data, static_cast<T>(0),
output_data);
if (bias) {
const T* bias_data = bias->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for (int bs = 0; bs < in_dims[0]; bs++) {
blas.AXPY(w_dims[1], static_cast<T>(1), bias_data,
output_data + bs * w_dims[1]);
}
}
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
math::FCCompute<platform::CPUDeviceContext, T>(
blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data,
bias ? bias->data<T>() : NULL);
}
};
......
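For reference, here is a naive scalar sketch (a hypothetical helper, not part of the patch) of exactly what the GEMM plus per-row AXPY pair, now factored into math::FCCompute, computes:

// Naive reference for the FC semantics: Y = X * W, then add the bias
// vector B (length N) to each of the M output rows. Illustrative only;
// the real code dispatches to optimized BLAS kernels.
template <typename T>
void NaiveFC(int M, int N, int K, const T* X, const T* W, T* Y,
             const T* B = nullptr) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      T sum = static_cast<T>(0);
      for (int k = 0; k < K; ++k) sum += X[m * K + k] * W[k * N + n];
      Y[m * N + n] = B ? sum + B[n] : sum;
    }
  }
}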
......@@ -16,9 +16,9 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace operators {
......@@ -205,23 +205,6 @@ inline void ReorderInitState(const DeviceContext& ctx,
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
// TODO(TJ): can move to math::details
template <typename DeviceContext, typename T>
inline void SimpleFC(const math::BlasT<DeviceContext, T>& blas, const int M,
const int N, const int K, const T* A, const T* B, T* C,
const T* bias_data = NULL) {
blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), A, B,
static_cast<T>(0), C);
if (bias_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), bias_data, C + i * N);
}
}
}
template <typename DeviceContext, typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> {
public:
......@@ -253,14 +236,15 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (x_dims[1] > wx_dims[1]) {
SimpleFC<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1], x_data,
wx_data, xx_data, bias->data<T>());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
x_data, wx_data, xx_data,
bias->data<T>());
to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
} else {
to_batch(dev_ctx, *x, xx, true, is_reverse);
SimpleFC<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
xx_data, wx_data, batched_gate_data,
bias->data<T>());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
xx_data, wx_data, batched_gate_data,
bias->data<T>());
}
int frame_size = static_cast<int>(wx_dims[1] / 4);
......
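The branch above reads like a data-movement optimization: wx_dims[1] is the gate width (4 * frame_size), so when the input rows are wider than the gates, running FCCompute first shrinks the rows before the sequence-to-batch reorder copies them; otherwise the reorder runs first on the narrower input. A hypothetical predicate capturing that choice:

#include <cstdint>

// Hypothetical helper (not in the patch): project before batching only when
// the projection output (gate_width == 4 * frame_size) is narrower than the
// input rows, so the sequence-to-batch copy moves less data per row.
inline bool ProjectBeforeBatching(int64_t input_width, int64_t gate_width) {
  return input_width > gate_width;
}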
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/blas.h"
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
const int N, const int K, const T* X, const T* W, T* Y,
const T* B = NULL) {
blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), X, W,
static_cast<T>(0), Y);
if (B) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle
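A minimal usage sketch of the new helper (float data and a CPU device context assumed; the variable names are illustrative):

// Y[M x N] = X[M x K] * W[K x N], with bias B (length N) broadcast over the
// M rows; pass NULL for the bias pointer to skip the AXPY loop entirely.
auto blas = math::GetBlas<platform::CPUDeviceContext, float>(dev_ctx);
math::FCCompute<platform::CPUDeviceContext, float>(
    blas, M, N, K, x_data, w_data, y_data, bias_data);

The OpenMP pragma parallelizes the per-row bias AXPY when FLAGS_paddle_num_threads is greater than one, which is why the header carries its own DECLARE_int32(paddle_num_threads) now that the declaration has moved out of fc_op.cc and fusion_lstm_op.cc.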
......@@ -64,27 +64,47 @@ class TestFCOp(OpTest):
self.check_output()
class TestFCOpBiasBoth(TestFCOp):
class TestFCOpNoBias(TestFCOp):
def init_shapes(self, mb, ic, oc, h, w):
for with_bias in {True, False}:
self.with_bias = with_bias
self.matrix = MatrixGenerate(mb, ic, oc, h, w)
self.with_bias = False
self.matrix = MatrixGenerate(mb, ic, oc, h, w)
class TestFCOp1(TestFCOpBiasBoth):
class TestFCOpWithBias(TestFCOp):
def init_shapes(self, mb, ic, oc, h, w):
self.with_bias = True
self.matrix = MatrixGenerate(mb, ic, oc, h, w)
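# Note: the old TestFCOpBiasBoth's loop over {True, False} appears to have
# simply overwritten self.with_bias each pass, so only the set's final value
# was ever exercised; the explicit NoBias/WithBias classes test both paths.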
class TestFCOp1(TestFCOpNoBias):
def init_op_type(self):
self.init_shapes(2, 8, 10, 1, 1)
class TestFCOp2(TestFCOpBiasBoth):
class TestFCOp2(TestFCOpNoBias):
def init_op_type(self):
self.init_shapes(4, 5, 6, 2, 2)
class TestFCOp4(TestFCOpBiasBoth):
class TestFCOp4(TestFCOpNoBias):
def init_op_type(self):
self.init_shapes(1, 32, 64, 3, 3)
class TestFCOpWithBias1(TestFCOpWithBias):
def init_op_type(self):
self.init_shapes(3, 8, 10, 2, 1)
class TestFCOpWithBias2(TestFCOpWithBias):
def init_op_type(self):
self.init_shapes(4, 5, 6, 2, 2)
class TestFCOpWithBias3(TestFCOpWithBias):
def init_op_type(self):
self.init_shapes(1, 64, 32, 3, 3)
if __name__ == "__main__":
unittest.main()