Commit 629fab49 authored by: J jiweibo

test=develop

Parent a2e0ab60
......@@ -34,6 +34,7 @@ void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
return r;
}
void TargetWrapper<TARGET(kHost)>::Free(void* ptr) {
ptr = Malloc(1);
if (ptr) {
free(static_cast<void**>(ptr)[-1]);
}
......
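The `Free` path above recovers the raw allocation from the slot just before the user pointer, which implies `Malloc` over-allocates and stashes the original pointer at `r[-1]`. (The `ptr = Malloc(1);` line added by this commit clobbers the incoming pointer and looks like a temporary debugging aid.) A minimal sketch of that stash-and-recover layout, assuming 64-byte alignment; `AlignedMalloc`/`AlignedFree` are illustrative names, not the library's API:

```cpp
#include <cstdint>
#include <cstdlib>

// Over-allocate, align the user pointer, and stash the raw pointer one
// slot before it -- the same layout Free's static_cast<void**>(ptr)[-1]
// dereference expects.
void* AlignedMalloc(size_t size, size_t alignment = 64) {
  void* raw = std::malloc(size + sizeof(void*) + alignment);
  if (raw == nullptr) return nullptr;
  uintptr_t start = reinterpret_cast<uintptr_t>(raw) + sizeof(void*);
  uintptr_t aligned = (start + alignment - 1) & ~(alignment - 1);
  void* user = reinterpret_cast<void*>(aligned);
  static_cast<void**>(user)[-1] = raw;  // stash for the matching free
  return user;
}

void AlignedFree(void* ptr) {
  if (ptr) {
    std::free(static_cast<void**>(ptr)[-1]);  // recover the raw pointer
  }
}
```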
......@@ -123,14 +123,13 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] =
input_data[i * in_dims[0] + j] + vector_data[j];
output_data[i * size + j] =
input_data[i * size + j] + vector_data[j];
}
}
}
......
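The fix in this hunk replaces the flat-index stride `in_dims[0]` (the number of rows) with `size` (the row width, `input.numel() / in_dims[0]`); the two are only equivalent for square inputs. A standalone sketch of the corrected indexing, with hypothetical names:

```cpp
#include <cstdint>
#include <vector>

// The flat offset of element (i, j) in a row-major [rows x width] matrix
// is i * width + j; the old code used i * rows + j, which scrambles the
// output whenever rows != width.
void RowwiseAddRef(const std::vector<float>& input,
                   const std::vector<float>& bias,
                   std::vector<float>* output,
                   int64_t rows, int64_t width) {
  for (int64_t i = 0; i < rows; ++i) {
    for (int64_t j = 0; j < width; ++j) {
      (*output)[i * width + j] = input[i * width + j] + bias[j];
    }
  }
}
```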
......@@ -23,6 +23,8 @@ void* TargetMalloc(TargetType target, size_t size) {
case TargetType::kHost:
case TargetType::kX86:
case TargetType::kARM:
data = TargetWrapper<TARGET(kHost)>::Malloc(size);
TargetWrapper<TARGET(kHost)>::Free(data);
data = TargetWrapper<TARGET(kHost)>::Malloc(size);
break;
#ifdef LITE_WITH_CUDA
......
......@@ -21,6 +21,7 @@
#include <string>
#include <vector>
#include "lite/core/program.h"
#include "lite/fluid/float16.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
......
......@@ -82,8 +82,7 @@ class FCFunctor {
memcpy(X1_data + i * KK, X + i * K, K * sizeof(T));
}
};
lite::x86::RunParallelFor(0, M, parallel_memcpy_x);
parallel_memcpy_x(0, M);
blas.GEMM(false,
false,
M,
......@@ -104,18 +103,17 @@ class FCFunctor {
memcpy(Y + i * N, Y1_data + i * NN, N * sizeof(T));
}
};
lite::x86::RunParallelFor(0, M, parallel_memcpy_y);
parallel_memcpy_y(0, M);
return;
}
lite::x86::RunParallelFor(0, M, parallel_compute);
parallel_compute(0, M);
} else {
blas.MatMul(M, N, K, X, W, Y);
if (!B) {
if (!B) {
return;
}
lite::x86::RunParallelFor(0, M, parallel_compute);
parallel_compute(0, M);
}
}
};
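Every change in this hunk swaps `lite::x86::RunParallelFor(0, M, fn)` for a direct `fn(0, M)` call. Because each lambda takes a `[begin, end)` row range, invoking it once with the full range does the same work on a single thread, making it a convenient serial drop-in while debugging. A sketch under the assumption that `RunParallelFor` chunks the range across threads (the helper below is hypothetical, not the library's implementation):

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

using RangeFn = std::function<void(size_t, size_t)>;

// Hypothetical stand-in for lite::x86::RunParallelFor: split [begin, end)
// into chunks and hand each chunk to its own thread.
void RunParallelFor(size_t begin, size_t end, const RangeFn& fn) {
  const size_t kThreads = 4;
  const size_t chunk = (end - begin + kThreads - 1) / kThreads;
  std::vector<std::thread> workers;
  for (size_t b = begin; b < end; b += chunk) {
    workers.emplace_back(fn, b, std::min(b + chunk, end));
  }
  for (auto& w : workers) w.join();
}

int main() {
  std::vector<int> data(100, 1);
  auto body = [&](size_t b, size_t e) {
    for (size_t i = b; i < e; ++i) data[i] *= 2;
  };
  RunParallelFor(0, data.size(), body);  // parallel execution
  body(0, data.size());                  // serial drop-in, as in the diff
}
```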
......@@ -139,7 +137,6 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
int M = output->dims().production() / w_dims1;
const T* input_data = input->data<T>();
const T* w_data = w->data<T>();
T* output_data = output->mutable_data<T>();
......
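In the FcCompute hunk above, `M` is derived by dividing the output element count by the effective weight width; with `padding_weights` the stored width carries 4 extra columns that must be subtracted first. A worked sketch of that arithmetic with made-up sizes (all values hypothetical):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  int64_t stored_width = 132;  // padded weight width w_dims[1]
  bool padding_weights = true;
  // Effective width: drop the 4 padding columns when weights are padded.
  int64_t w_dims1 = padding_weights ? stored_width - 4 : stored_width;  // 128
  int64_t output_production = 128 * 16;  // hypothetical output->dims().production()
  int64_t M = output_production / w_dims1;
  std::cout << M << "\n";  // 16 rows fed to the GEMM
}
```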
......@@ -17,11 +17,18 @@
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
#include <chrono>
#include "lite/fluid/eigen.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = lite::fluid::EigenMatrix<T, MajorType, IndexType>;
// using Tensor = framework::Tensor;
inline lite::Tensor ReshapeToMatrix(const lite::Tensor& src, int num_col_dims) {
int rank = src.dims().size();
......@@ -40,9 +47,7 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
using param_t = operators::MulParam;
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulParam>();
// CHECK(context.x86_device_context());
auto* z = param.output;
......@@ -50,7 +55,6 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto* y = param.y;
Tensor x_matrix, y_matrix;
if (x->dims().size() > 2) {
x_matrix = ReshapeToMatrix(*x, param.x_num_col_dims);
} else {
......@@ -64,18 +68,10 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
y_matrix = *y;
}
z->mutable_data<T>();
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context);
blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> mat_test(
    x_matrix.mutable_data<T>(), x_matrix.dims()[0], x_matrix.dims()[1]);
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> mat1_test(
    y_matrix.mutable_data<T>(), y_matrix.dims()[0], y_matrix.dims()[1]);
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_test(
    z->mutable_data<T>(), z->dims()[0], z->dims()[1]);
out_test = mat_test * mat1_test;
}
virtual ~MulCompute() = default;
......
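The added `Eigen::Map` block recomputes the product directly and overwrites `z`, bypassing the removed `blas.MatMul` path. Note that `Eigen::Matrix` defaults to column-major storage, so mapping row-major tensor buffers without the `Eigen::RowMajor` flag effectively transposes the operands. A self-contained sketch of such a cross-check with explicit row-major maps (the shapes below are made up):

```cpp
#include <Eigen/Dense>
#include <iostream>
#include <vector>

using RowMajorMat =
    Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6};  // 2 x 3, row-major
  std::vector<float> y = {1, 0, 0, 1, 1, 1};  // 3 x 2, row-major
  std::vector<float> z(4, 0.f);               // 2 x 2 result

  Eigen::Map<RowMajorMat> mat_x(x.data(), 2, 3);
  Eigen::Map<RowMajorMat> mat_y(y.data(), 3, 2);
  Eigen::Map<RowMajorMat> mat_z(z.data(), 2, 2);
  mat_z = mat_x * mat_y;  // reference product to compare with the BLAS path

  std::cout << mat_z << "\n";  // expect [4 5; 10 11]
}
```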
......@@ -24,3 +24,14 @@ REGISTER_LITE_KERNEL(
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.Finalize();
REGISTER_LITE_KERNEL(
sequence_reshape,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SequenceReshapeFloatCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.Finalize();
......@@ -31,21 +31,16 @@ class SequenceReshapeCompute
void Run() override {
auto& param = *param_.get_mutable<operators::SequenceReshapeParam>();
// auto& context = context_->As<X86Context>();
auto* in = param.x;
auto* out = param.output;
int out_width = param.new_dim;
const auto& in_dims = in->dims();
int64_t in_width = in_dims[1];
auto& in_lod = in->lod();
CHECK_EQ(in_lod.size(), 1UL);
CHECK_EQ((uint64_t)in_dims[0], in_lod[0].back());
auto in_lod_l0 = in_lod[0];
int seq_num = in_lod_l0.size() - 1;
if (in_width == out_width) {
out->set_lod(in->lod());
} else {
......@@ -61,8 +56,7 @@ class SequenceReshapeCompute
out_lod[0][i + 1] = out_lod[0][i] + offset;
}
}
out->Resize(std::vector<int64_t>{static_cast<int64_t>(out->lod()[0].back()),
out->Resize(std::vector<int64_t>{in->numel() / out_width,
out_width});
auto* dst_ptr = out->mutable_data<T>();
auto size = in->numel() * sizeof(T);
......@@ -72,6 +66,53 @@ class SequenceReshapeCompute
virtual ~SequenceReshapeCompute() = default;
};
template <typename T>
class SequenceReshapeFloatCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::SequenceReshapeParam;
void Run() override {
auto& param = *param_.get_mutable<operators::SequenceReshapeParam>();
auto* in = param.x;
auto* out = param.output;
auto out_data = out->mutable_data<T>();
for (int i = 0; i < out->dims().production(); i++) {
out_data[i] = 0;
}
int out_width = param.new_dim;
const auto& in_dims = in->dims();
int64_t in_width = in_dims[1];
auto& in_lod = in->lod();
CHECK_EQ(in_lod.size(), 1UL);
CHECK_EQ((uint64_t)in_dims[0], in_lod[0].back());
auto in_lod_l0 = in_lod[0];
int seq_num = in_lod_l0.size() - 1;
if (in_width == out_width) {
out->set_lod(in->lod());
} else {
auto& out_lod = *out->mutable_lod();
out_lod.resize(1);
out_lod[0].resize(seq_num + 1);
out_lod[0][0] = 0;
for (int i = 0; i < seq_num; ++i) {
size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i];
size_t offset = 0;
offset = (seq_len * in_width) / out_width;
CHECK_EQ(offset * out_width, seq_len * in_width);
out_lod[0][i + 1] = out_lod[0][i] + offset;
}
}
out->Resize(std::vector<int64_t>{in->numel() / out_width,
out_width});
auto* dst_ptr = out->mutable_data<T>();
auto size = in->numel() * sizeof(T);
std::memcpy(dst_ptr, in->data<T>(), size);
}
virtual ~SequenceReshapeFloatCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
......
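Both sequence_reshape kernels recompute the output LoD when the width changes: each sequence's new row count is `seq_len * in_width / out_width`, with a `CHECK_EQ` guarding that the division is exact so no elements are dropped. A worked sketch of that arithmetic (the LoD and widths below are made up):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<size_t> in_lod = {0, 4, 10};  // two sequences: 4 and 6 rows
  size_t in_width = 2, out_width = 4;       // reshape rows from width 2 to 4

  std::vector<size_t> out_lod = {0};
  for (size_t i = 0; i + 1 < in_lod.size(); ++i) {
    size_t seq_len = in_lod[i + 1] - in_lod[i];
    // Rows after reshape; seq_len * in_width must divide evenly by out_width
    // (the kernel enforces this with CHECK_EQ).
    size_t offset = seq_len * in_width / out_width;
    out_lod.push_back(out_lod.back() + offset);
  }
  for (size_t v : out_lod) std::cout << v << " ";  // prints: 0 2 5
}
```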