make fc math right

8c7c13f1 · superjomn · dd2bd8e3 · 8c7c13f1 · 8c7c13f1 · 8c7c13f1
5 changed files
--- a/paddle/fluid/lite/core/tensor.cc
+++ b/paddle/fluid/lite/core/tensor.cc
@@ -13,3 +13,33 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+
+// Writes `dims` to `os` as "[d0 d1 ... dn]" (space-separated); an empty DDim
+// is written as "[]". Returns `os` so calls can be chained.
+std::ostream &operator<<(std::ostream &os, const DDim &dims) {
+  if (dims.empty()) return os << "[]";
+
+  os << "[";
+  // Unsigned index and `i + 1 < size()` avoid a signed/unsigned comparison.
+  for (size_t i = 0; i + 1 < dims.size(); i++) {
+    os << dims[i] << " ";
+  }
+  os << dims.back() << "]";
+  return os;
+}
+
+// Debug printer: emits "Tensor:", the dims, then each element read as float.
+std::ostream &operator<<(std::ostream &os, const Tensor &tensor) {
+  os << "Tensor:" << '\n' << "dim: " << tensor.dims();
+  const auto numel = product(tensor.dims());  // loop-invariant, compute once
+  for (int i = 0; i < numel; i++) {
+    os << tensor.data<float>()[i] << " ";
+  }
+  return os << "\n";
+}
+
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/core/tensor.h
+++ b/paddle/fluid/lite/core/tensor.h
@@ -14,8 +14,8 @@

 #pragma once
 #include <algorithm>
-#include <vector>
 #include <numeric>
+#include <vector>
 #include "memory.h"

 namespace paddle {
@@ -91,5 +91,8 @@ class Tensor {
  LoD lod_;
 };

+std::ostream& operator<<(std::ostream& os, const DDim& dims);  // "[d0 d1 ...]"
+std::ostream& operator<<(std::ostream& os, const Tensor& tensor);  // debug dump
+
 }  // namespace lite
 }  // namespace paddle
--- a/paddle/fluid/lite/kernels/host/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/host/fc_compute.cc
@@ -30,25 +30,18 @@ void FcCompute::Run() {

  CHECK_GE(param.input->dims().size(), 2UL);
  CHECK_EQ(param.output->dims().size(), 2UL);
-  Eigen::Map<const matrix_t> input(
-      param.input->data<float>(),
-      product(param.input->dims().begin(),
-              param.input->dims().begin() + param.in_num_col_dims),
+
+  fc_compute_eigen(
+      param.input->data<float>(),  // x
      product(param.input->dims().begin() + param.in_num_col_dims,
-              param.input->dims().end()));
-  Eigen::Map<const matrix_t> weight(param.w->data<float>(), param.w->dims()[0],
-                                    param.w->dims()[1]);
-  matrix_map_t output(param.output->mutable_data<float>(),
-                      param.output->dims()[0], param.output->dims()[1]);
-
-  output = weight.transpose() * input;
-
-  if (param.bias) {
-    Eigen::Map<const matrix_t> bias(param.bias->data<float>(),
-                                    param.bias->dims()[0],
-                                    param.bias->dims()[1]);
-    output += bias;
-  }
+              param.input->dims().end()),  // x_w
+      product(param.input->dims().begin(),
+              param.input->dims().begin() + param.in_num_col_dims),  // x_h
+      param.w->data<float>(),                                        // w
+      param.w->dims()[1],                                            // w_w
+      param.w->dims()[0],                                            // w_h
+      param.bias->data<float>(),                                     // b
+      param.output->mutable_data<float>());
 }

 TargetType FcCompute::target() const { return TARGET(kHost); }

--- a/paddle/fluid/lite/kernels/host/fc_compute.h
+++ b/paddle/fluid/lite/kernels/host/fc_compute.h
@@ -13,6 +13,8 @@
 // limitations under the License.

 #pragma once
+#include <glog/logging.h>
+#include <Eigen/Core>
 #include "paddle/fluid/lite/core/kernel.h"
 #include "paddle/fluid/lite/operators/fc_op.h"

@@ -33,6 +35,52 @@ class FcCompute : public OpKernel<TARGET(kHost), PRECISION(kFloat)> {
  virtual ~FcCompute() = default;
 };

+// Eigen FC: out(x_h, w_h) = X(x_h, x_w) * W(w_h, w_w)^T, all row-major. When
+// b is non-null it is a length-w_h bias added to every output row.
+template <typename T>
+void fc_compute_eigen(const T* x, int x_w, int x_h,  //
+                      const T* w, int w_w, int w_h,  //
+                      const T* b,                    //
+                      T* out) {
+  CHECK_EQ(x_w, w_w);  // X * W^T needs equal inner dims (cf. fc_compute_naive)
+  using matrix_t =
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  Eigen::Map<const matrix_t> X(x, x_h, x_w);
+  Eigen::Map<const matrix_t> W(w, w_h, w_w);
+  Eigen::Map<matrix_t> Out(out, x_h, w_h);
+  Out = X * W.transpose();
+  if (b) {
+    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> B(b, w_h);
+    Out = Out.array().rowwise() + B.transpose().array();
+  }
+}
+
+// Inner product: returns the sum of x[k] * y[k] for k in [0, dim), or a
+// value-initialized T when dim <= 0. The attribute requests loop unrolling.
+template <typename T>
+__attribute__((optimize("unroll-loops")))  //
+T dot(const T* x, const T* y, int dim) {
+  T acc{};
+  for (int k = 0; k < dim; ++k) acc += x[k] * y[k];
+  return acc;
+}
+
+// Reference FC: out(x_h, w_h)[r][c] = dot(x row r, w row c) + b[c]. Buffers
+// are row-major; a null b means no bias. Every out element is overwritten.
+template <typename T>
+void fc_compute_naive(const T* x, int x_w, int x_h,  //
+                      const T* w, int w_w, int w_h,  //
+                      const T* b,                    //
+                      T* out) {
+  CHECK_EQ(x_w, w_w);
+  for (int r = 0; r < x_h; r++) {
+    for (int c = 0; c < w_h; c++) {
+      const T bias = b ? b[c] : T{};  // null b handled like fc_compute_eigen
+      out[r * w_h + c] = dot(&x[r * x_w], &w[c * w_w], w_w) + bias;
+    }
+  }
+}
+
 }  // namespace host
 }  // namespace kernels
 }  // namespace lite

--- a/paddle/fluid/lite/kernels/host/fc_compute_test.cc
+++ b/paddle/fluid/lite/kernels/host/fc_compute_test.cc
@@ -22,6 +22,37 @@ namespace lite {
 namespace kernels {
 namespace host {

+// Checks that the naive and Eigen FC implementations agree on a small case.
+TEST(fc_compute_naive, test) {
+  Tensor x, w, b, out, out1;
+  const int batch_size = 2;
+  x.Resize({batch_size, 3});
+  w.Resize({4, 3});
+  b.Resize({1, 4});
+  out.Resize({batch_size, 4});
+  out1.Resize({batch_size, 4});
+  auto x_data = x.mutable_data<float>();
+  auto w_data = w.mutable_data<float>();
+  auto b_data = b.mutable_data<float>();
+  auto out_data = out.mutable_data<float>();
+  auto out_data1 = out1.mutable_data<float>();
+
+  // Fill the inputs with 0, 1, 2, ... so both runs see identical data.
+  for (int i = 0; i < product(x.dims()); i++) x_data[i] = i;
+  for (int i = 0; i < product(w.dims()); i++) w_data[i] = i;
+  for (int i = 0; i < product(b.dims()); i++) b_data[i] = i;
+
+  fc_compute_naive(x_data, 3, batch_size,  //
+                   w_data, 3, 4,           //
+                   b_data, out_data);
+  fc_compute_eigen(x_data, 3, batch_size,  //
+                   w_data, 3, 4,           //
+                   b_data, out_data1);
+  // Compare every output element, not only index 0.
+  for (int i = 0; i < product(out.dims()); i++) {
+    EXPECT_NEAR(out_data[i], out_data1[i], 1e-6);
+  }
+}
+
 TEST(fc_host, init) {
  FcCompute fc;
  ASSERT_EQ(fc.precision(), PRECISION(kFloat));