add ops and kernels that mul, scale, fc, relu, softmax, dropout, elem… (#17711)

* fix conflicts * fix kernel registry realted bugs test=develop

add ops and kernels that mul, scale, fc, relu, softmax, dropout, elem… (#17711)
* fix conflicts * fix kernel registry realted bugs test=develop
202a015b · lijianshe02 · GitHub · a55c1510 · 202a015b · 202a015b
25 changed file
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 syntax = "proto2";
-option optimize_for = LITE_RUNTIME;
+// option optimize_for = LITE_RUNTIME;
 package paddle.framework.proto;

 // Any incompatible changes to ProgramDesc and its dependencies should

--- a/paddle/fluid/lite/api/CMakeLists.txt
+++ b/paddle/fluid/lite/api/CMakeLists.txt
@@ -25,10 +25,14 @@ set(LITE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inferenc
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
        "A path setting inference demo download directories.")

-# lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
-#   DEPS cxx_api_lite model_parser_lite target_wrapper_host
-#   ${ops_lite} ${host_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
-#         --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
+if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
+   DEPS cxx_api_lite model_parser_lite target_wrapper_host
+   ${ops_lite} ${host_kernels} ${x86_kernels}
+   ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
+        --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
+add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
+endif(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)

 if(WITH_TESTING)
 lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")

--- a/paddle/fluid/lite/api/cxx_api_test.cc
+++ b/paddle/fluid/lite/api/cxx_api_test.cc
@@ -32,7 +32,8 @@ namespace lite {
 TEST(CXXApi, test) {
  lite::ExecutorLite predictor;
 #ifndef LITE_WITH_CUDA
-  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)}});
+  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
+                                   Place{TARGET(kX86), PRECISION(kFloat)}});
 #else
  std::vector<Place> valid_places({
      Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)},
@@ -44,7 +45,8 @@ TEST(CXXApi, test) {
  });
 #endif

-  predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
+  predictor.Build(FLAGS_model_dir,
+                  Place{TARGET(kX86), PRECISION(kFloat)},  // origin cuda
                  valid_places);

  auto* input_tensor = predictor.GetInput(0);
@@ -69,7 +71,8 @@ TEST(CXXApi, test) {
 #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 TEST(CXXApi, save_model) {
  lite::ExecutorLite predictor;
-  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)}});
+  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
+                                   Place{TARGET(kX86), PRECISION(kFloat)}});
  predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
                  valid_places);

@@ -78,7 +81,7 @@ TEST(CXXApi, save_model) {
 #endif  // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK

 #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-TEST(CXXTrainer, train) {
+/*TEST(CXXTrainer, train) {
  Place prefer_place({TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)});
  std::vector<Place> valid_places({prefer_place});
  auto scope = std::make_shared<lite::Scope>();
@@ -108,7 +111,7 @@ TEST(CXXTrainer, train) {
  data0[0] = 0;

  exe.Run();
-}
+}*/
 #endif  // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK

 }  // namespace lite
@@ -116,13 +119,31 @@ TEST(CXXTrainer, train) {

 USE_LITE_OP(mul);
 USE_LITE_OP(fc);
+USE_LITE_OP(relu);
 USE_LITE_OP(scale);
 USE_LITE_OP(feed);
 USE_LITE_OP(fetch);
 USE_LITE_OP(io_copy);
+USE_LITE_OP(elementwise_add)
+USE_LITE_OP(elementwise_sub)
+USE_LITE_OP(square)
+USE_LITE_OP(softmax)
+USE_LITE_OP(dropout)
 USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
 USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);

+#ifdef LITE_WITH_X86
+USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def);
+#endif
+
 #ifdef LITE_WITH_CUDA
 USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
 USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);

--- a/paddle/fluid/lite/core/context.h
+++ b/paddle/fluid/lite/core/context.h
@@ -95,7 +95,11 @@ struct CUDAContext {
 #ifdef LITE_WITH_X86
 struct X86Context {
  // overall information
-
+  X86Context() {
+    x86_device_context.reset(new ::paddle::platform::CPUDeviceContext);
+    x86_execution_context.reset(
+        new ::paddle::framework::ExecutionContext(*x86_device_context));
+  }
  // kernel information

  // legacy info.

--- a/paddle/fluid/lite/core/mir/runtime_context_assign_pass.cc
+++ b/paddle/fluid/lite/core/mir/runtime_context_assign_pass.cc
@@ -34,9 +34,13 @@ class RuntimeContextAssignPass : public StmtPass {
      auto& inst = node.AsStmt();
      switch (inst.picked_kernel().target()) {
        case TARGET(kHost):
-        case TARGET(kX86):
          inst.picked_kernel().SetContext(NewHostContext());
          break;
+#ifdef LITE_WITH_X86
+        case TARGET(kX86):
+          inst.picked_kernel().SetContext(NewX86Context());
+          break;
+#endif
 #ifdef LITE_WITH_CUDA
        case TARGET(kCUDA):
          inst.picked_kernel().SetContext(NewCudaContext());
@@ -61,6 +65,13 @@ class RuntimeContextAssignPass : public StmtPass {

    return ctx;
  }
+#ifdef LITE_WITH_X86
+  std::unique_ptr<KernelContext> NewX86Context() {
+    std::unique_ptr<KernelContext> ctx(new KernelContext);
+    ctx->As<X86Context>();
+    return ctx;
+  }
+#endif

 #ifdef LITE_WITH_ARM
  std::unique_ptr<KernelContext> NewARMContext() {

--- a/paddle/fluid/lite/core/op_registry.cc
+++ b/paddle/fluid/lite/core/op_registry.cc
@@ -91,6 +91,10 @@ KernelRegistry::KernelRegistry()
  INIT_FOR(kHost, kAny, kNCHW);
  INIT_FOR(kHost, kAny, kAny);

+  INIT_FOR(kX86, kFloat, kNCHW);
+  INIT_FOR(kX86, kAny, kNCHW);
+  INIT_FOR(kX86, kAny, kAny);
+
  INIT_FOR(kARM, kFloat, kNCHW);
  INIT_FOR(kARM, kAny, kNCHW);
  INIT_FOR(kARM, kAny, kAny);

--- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt
@@ -3,18 +3,29 @@ if(NOT LITE_WITH_X86)
 endif()

 cc_library(activation_compute_x86 SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_op)
-cc_library(elementwise_compute_x86 SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
 cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps})
 cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
-cc_library(mul_compute_x86 SRCS mul_compute.cc DEPS ${lite_kernel_deps})
 cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps})

+cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps})
+cc_library(mul_compute_x86 SRCS mul_compute.cc DEPS ${lite_kernel_deps})
+cc_library(relu_compute_x86 SRCS relu_compute.cc DEPS ${lite_kernel_deps})
+cc_library(scale_compute_x86 SRCS scale_compute.cc DEPS ${lite_kernel_deps})
+cc_library(elementwise_compute_x86 SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} elementwise_sub_op elementwise_add_op)
+cc_library(softmax_compute_x86 SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax)
+cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} )
+
 set(x86_kernels
    activation_compute_x86
    elementwise_compute_x86
    mean_compute_x86
    fill_constant_compute_x86
    mul_compute_x86
+    relu_compute_x86
+    fc_compute_x86
+    scale_compute_x86
+    softmax_compute_x86 
+    dropout_compute_x86
    )

 set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
--- a/paddle/fluid/lite/kernels/x86/dropout_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/dropout_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <random>
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::DropoutParam;
+  void Run() override {
+    auto& param = *param_.get_mutable<operators::DropoutParam>();
+    const auto* x_data = param.x->data<T>();
+    auto* out_data = param.output->template mutable_data<T>();
+    if (!param.is_test) {
+      auto* mask_data = param.mask->template mutable_data<T>();
+      std::random_device rnd;
+      std::minstd_rand engine;
+      int seed = param.fix_seed ? param.seed : rnd();
+      engine.seed(seed);
+      std::uniform_real_distribution<float> dist(0, 1);
+
+      size_t size = framework::product(param.mask->dims().data());
+      for (size_t i = 0; i < size; ++i) {
+        if (dist(engine) < param.dropout_prob) {
+          mask_data[i] = 0;
+          out_data[i] = 0;
+        } else {
+          if (param.dropout_implementation == "upscale_in_train") {
+            mask_data[i] = 1.0f / static_cast<T>(1.0f - param.dropout_prob);
+            out_data[i] = x_data[i] / static_cast<T>(1.0f - param.dropout_prob);
+          } else {
+            mask_data[i] = 1;
+            out_data[i] = x_data[i];
+          }
+        }
+      }
+    } else {
+      auto X = EigenMatrix<T>::Reshape(param.x->raw_tensor(), 1);
+      auto Y = EigenMatrix<T>::Reshape(param.output->raw_tensor(), 1);
+      auto& place = *platform::CPUDeviceContext().eigen_device();
+      if (param.dropout_implementation == "upscale_in_train") {
+        Y.device(place) = X;
+      } else {
+        Y.device(place) = X * static_cast<T>(1.0f - param.dropout_prob);
+      }
+    }
+  }
+
+  virtual ~DropoutCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(dropout, kX86, kFloat, kNCHW,
+                     paddle::lite::kernels::x86::DropoutCompute<float>, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/x86/elementwise_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/elementwise_compute.cc
@@ -30,6 +30,11 @@ struct SubFunctor {
  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
 };

+template <typename T>
+struct AddFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
+
 template <typename T>
 class ElementwiseSubCompute
    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
@@ -67,10 +72,9 @@ class ElementwiseSubGradCompute
    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 public:
  using param_t = operators::ElementwiseGradParam;
-
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
-    auto& context = context_->As<X86Context>();
+    auto& context = ctx_->As<X86Context>();
    CHECK(context.x86_device_context);

    param.X_grad->template mutable_data<T>();
@@ -89,6 +93,26 @@ class ElementwiseSubGradCompute
  virtual ~ElementwiseSubGradCompute() = default;
 };

+template <typename T>
+class ElementwiseAddCompute
+    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ElementwiseParam;
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    auto& context = ctx_->As<X86Context>();
+    CHECK(context.x86_device_context);
+    param.Out->template mutable_data<T>();
+    paddle::operators::ElementwiseComputeEx<AddFunctor<T>,
+                                            platform::CPUDeviceContext, T>(
+        *context.x86_execution_context, &param.X->raw_tensor(),
+        &param.Y->raw_tensor(), param.axis, AddFunctor<T>(),
+        &param.Out->raw_tensor());
+  }
+
+  virtual ~ElementwiseAddCompute() = default;
+};
+
 }  // namespace x86
 }  // namespace kernels
 }  // namespace lite
@@ -113,3 +137,11 @@ REGISTER_LITE_KERNEL(elementwise_sub_grad, kX86, kFloat, kNCHW,
    .BindOutput(paddle::framework::GradVarName("Y"),
                {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW,
+                     paddle::lite::kernels::x86::ElementwiseAddCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/x86/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/fc_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Eigen/Core>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/type_system.h"
+#include "paddle/fluid/lite/operators/fc_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T>
+void fc_compute_eigen(const T* x, int x_w, int x_h,  //
+                      const T* w, int w_w, int w_h,  //
+                      const T* b,                    //
+                      T* out) {
+  using matrix_t =
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+
+  Eigen::Map<const matrix_t> X(x, x_h, x_w);
+  Eigen::Map<const matrix_t> W(w, w_h, w_w);
+  Eigen::Map<matrix_t> Out(out, x_h, w_h);
+
+  Out = X * W.transpose();
+
+  if (b) {
+    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> B(b, w_h);
+    Out = Out.array().rowwise() + B.transpose().array();
+  }
+}
+
+template <typename T>
+__attribute__((optimize("unroll-loops")))  //
+T dot(const T* x, const T* y, int dim) {
+  T out{};
+  for (int i = 0; i < dim; i++) {
+    out += x[i] * y[i];
+  }
+  return out;
+}
+
+template <typename T>
+void fc_compute_naive(const T* x, int x_w, int x_h,  //
+                      const T* w, int w_w, int w_h,  //
+                      const T* b,                    //
+                      T* out) {
+  CHECK_EQ(x_w, w_w);
+  // out shape: (x_h, w_w)
+  memset(out, 0, x_h * w_h * sizeof(T));
+
+  for (int r = 0; r < x_h; r++) {
+    for (int c = 0; c < w_h; c++) {
+      out[r * w_h + c] = dot(&x[r * x_w], &w[c * w_w], w_w) + b[c];
+    }
+  }
+}
+
+template <typename T>
+class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::FcParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    CHECK_GE(param.input->dims().size(), 2UL);
+    CHECK_EQ(param.output->dims().size(), 2UL);
+
+    fc_compute_eigen(
+        param.input->data<T>(),  // x
+        param.input->dims().Slice(0, param.in_num_col_dims).production(),
+        param.input->dims()
+            .Slice(param.in_num_col_dims, param.input->dims().size())
+            .production(),
+        param.w->data<T>(),     // w
+        param.w->dims()[1],     // w_w
+        param.w->dims()[0],     // w_h
+        param.bias->data<T>(),  // b
+        param.output->mutable_data<T>());
+  }
+
+  virtual ~FcCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(fc, kX86, kFloat, kNCHW,
+                     paddle::lite::kernels::x86::FcCompute<float>, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/x86/fill_constant_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/fill_constant_compute.cc
@@ -31,7 +31,7 @@ class FillConstantCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
-    auto& context = context_->As<X86Context>();
+    auto& context = ctx_->As<X86Context>();
    CHECK(context.x86_device_context);

    param.Out->template mutable_data<T>();

--- a/paddle/fluid/lite/kernels/x86/mean_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/mean_compute.cc
@@ -37,7 +37,7 @@ class MeanCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
-    auto& context = context_->As<X86Context>();
+    auto& context = ctx_->As<X86Context>();
    CHECK(context.x86_device_context);

    param.Out->template mutable_data<T>();
@@ -59,7 +59,7 @@ class MeanGradCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
-    auto& context = context_->As<X86Context>();
+    auto& context = ctx_->As<X86Context>();
    CHECK_EQ(param.Out_grad->raw_tensor().numel(), 1);
    CHECK(context.x86_device_context);


--- a/paddle/fluid/lite/kernels/x86/mul_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/mul_compute.cc
@@ -30,7 +30,7 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
  using param_t = operators::MulParam;

  void Run() override {
-    auto& context = context_->As<X86Context>();
+    auto& context = ctx_->As<X86Context>();
    auto& param = *param_.get_mutable<operators::MulParam>();
    CHECK(context.x86_device_context);

@@ -68,7 +68,7 @@ template <typename T>
 class MulGradCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 public:
  void Run() override {
-    auto& context = context_->As<X86Context>();
+    auto& context = ctx_->As<X86Context>();
    auto& param = *param_.get_mutable<operators::MulGradParam>();
    CHECK(context.x86_device_context);


--- a/paddle/fluid/lite/kernels/x86/relu_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/relu_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Eigen/Core>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/type_system.h"
+#include "paddle/fluid/lite/operators/relu_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T>
+class ReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ReluParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    auto n = param.input->dims().production();
+    const float* input = param.input->data<float>();
+    float* output = param.output->mutable_data<float>();
+    for (int i = 0; i < n; i++) {
+      output[i] = std::max(0.f, input[i]);
+    }
+  }
+
+  virtual ~ReluCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(relu, kX86, kFloat, kNCHW,
+                     paddle::lite::kernels::x86::ReluCompute<float>, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/x86/scale_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/scale_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Eigen/Core>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/type_system.h"
+#include "paddle/fluid/lite/operators/relu_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T>
+void scale_compute(const T* x, T* out, int size, float scale, float bias,
+                   bool bias_before) {
+  if (bias_before) bias *= scale;
+  for (int i = 0; i < size; i++) {
+    out[i] = x[i] * scale + bias;
+  }
+}
+
+template <typename T>
+class ScaleCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ScaleParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    scale_compute(param.x->data<T>(), param.output->mutable_data<T>(),
+                  param.x->dims().production(), param.scale, param.bias,
+                  param.bias_after_scale);
+  }
+
+  virtual ~ScaleCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(scale, kX86, kFloat, kNCHW,
+                     paddle::lite::kernels::x86::ScaleCompute<float>, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/x86/softmax_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/softmax_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+static inline int CanonicalAxis(const int axis, const int rank) {
+  if (axis < 0) {
+    return axis + rank;
+  }
+  return axis;
+}
+
+static inline int SizeToAxis(const int axis, lite::DDim dims) {
+  int size = 1;
+  for (int i = 0; i < axis; i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+static inline int SizeFromAxis(const int axis, lite::DDim dims) {
+  int size = 1;
+  for (int i = axis; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+template <typename T>
+class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SoftmaxParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<operators::SoftmaxParam>();
+    // auto& context = context_->As<X86Context>();
+    CHECK(param.output);
+    CHECK(param.x);
+    const int rank = param.x->dims().size();
+    const int axis = CanonicalAxis(param.axis, rank);
+    int axis_dim = param.x->dims()[axis];
+    const int n = SizeToAxis(axis, param.x->dims());
+    const int d = SizeFromAxis(axis, param.x->dims());
+    std::vector<int64_t> shape{n, d};
+
+    lite::Tensor input_2d, out_2d;
+    input_2d.ShareDataWith(*param.x);
+    input_2d.Resize(lite::DDim(shape));
+    out_2d.ShareDataWith(*param.output);
+    out_2d.Resize(lite::DDim(shape));
+
+    paddle::operators::math::SoftmaxFunctor<platform::CPUDeviceContext, T,
+                                            true>()(
+        platform::CPUDeviceContext(), axis_dim, &input_2d.raw_tensor(),
+        &out_2d.raw_tensor());
+  }
+
+  virtual ~SoftmaxCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(softmax, kX86, kFloat, kNCHW,
+                     paddle::lite::kernels::x86::SoftmaxCompute<float>, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/paddle/fluid/lite/operators/CMakeLists.txt
+++ b/paddle/fluid/lite/operators/CMakeLists.txt
@@ -13,8 +13,9 @@ cc_library(elementwise_ops_lite SRCS elementwise_ops.cc DEPS ${op_DEPS})
 cc_library(mean_op_lite SRCS mean_op.cc DEPS ${op_DEPS})
 cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS})
 #cc_library(sgd_op_lite SRCS sgd_op.cc DEPS ${op_DEPS})
-
 cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite)
+cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS})
+
 set(ops_lite
        fc_op_lite
        relu_op_lite
@@ -27,7 +28,9 @@ set(ops_lite
        elementwise_ops_lite
        mean_op_lite
        fill_constant_op_lite
+        activation_ops_lite
+        dropout_op_lite
        PARENT_SCOPE)

-lite_cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite memory_lite)
+lite_cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite memory_lite fc_compute_x86)
 lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite)
--- a/paddle/fluid/lite/operators/dropout_op.cc
+++ b/paddle/fluid/lite/operators/dropout_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class DropoutOpLite : public OpLite {
+ public:
+  explicit DropoutOpLite(const std::string& type) : OpLite(type) {}
+
+  bool CheckShape() const override {
+    CHECK_OR_FALSE(param_.x);
+    return true;
+  }
+
+  bool InferShape() const override {
+    const auto x_dims = param_.x->dims();
+    param_.output->Resize(x_dims);
+    if (param_.is_test == false) {
+      param_.mask->Resize(x_dims);
+    }
+    // share LoD
+    // param_.output->set_lod(param_.input->lod());
+    return true;
+  }
+
+  void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
+  // TODO(Superjomn) replace framework::OpDesc with a lite one.
+  bool AttachImpl(const OpDesc& op_desc, lite::Scope* scope) override {
+    auto input = op_desc.Input("X").front();
+    auto out = op_desc.Output("Out").front();
+    auto Mask = op_desc.Output("Mask").front();
+
+    param_.x = GetVar<lite::Tensor>(scope, input);
+    param_.output = GetMutableVar<lite::Tensor>(scope, out);
+    param_.mask = GetMutableVar<lite::Tensor>(scope, Mask);
+
+    param_.dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
+    if (op_desc.HasAttr("axis")) {
+      param_.is_test = boost::get<bool>(op_desc.GetAttr("is_test"));
+    }
+    param_.fix_seed = boost::get<bool>(op_desc.GetAttr("fix_seed"));
+    param_.seed = boost::get<int>(op_desc.GetAttr("seed"));
+    param_.dropout_implementation =
+        boost::get<int>(op_desc.GetAttr("dropout_implementation"));
+    return true;
+  }
+
+  std::string DebugString() const override { return "dropout"; }
+
+ private:
+  mutable DropoutParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(dropout, paddle::lite::operators::DropoutOpLite);
--- a/paddle/fluid/lite/operators/elementwise_ops.cc
+++ b/paddle/fluid/lite/operators/elementwise_ops.cc
@@ -43,9 +43,8 @@ class ElementwiseOp : public OpLite {

    param_.X = GetVar<lite::Tensor>(scope, X_name);
    param_.Y = GetVar<lite::Tensor>(scope, Y_name);
-    param_.Out = GetMutableVar<Tensor>(scope, Out_name);
-    param_.axis = GetAttr<int>(opdesc.GetAttr("axis"));
-
+    param_.Out = GetMutableVar<lite::Tensor>(scope, Out_name);
+    param_.axis = boost::get<int>(opdesc.GetAttr("axis"));
    return true;
  }

@@ -110,3 +109,4 @@ REGISTER_LITE_OP(elementwise_sub, paddle::lite::operators::ElementwiseOp);
 REGISTER_LITE_OP(elementwise_sub_grad,
                 paddle::lite::operators::ElementwiseGradExplicitOp);
 #endif
+REGISTER_LITE_OP(elementwise_add, paddle::lite::operators::ElementwiseOp);
--- a/paddle/fluid/lite/operators/fc_op_test.cc
+++ b/paddle/fluid/lite/operators/fc_op_test.cc
@@ -57,10 +57,16 @@ TEST(fc_op_lite, test) {

  FcOpLite fc("fc");

-  fc.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}});
+  fc.SetValidPlaces({Place{TARGET(kX86), PRECISION(kFloat)}});
  fc.Attach(desc, &scope);
+  auto kernels = fc.CreateKernels({Place{TARGET(kX86), PRECISION(kFloat)}});
+  ASSERT_FALSE(kernels.empty());
 }

 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
+#ifdef LITE_WITH_X86
+
+USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
+#endif
--- a/paddle/fluid/lite/operators/op_params.h
+++ b/paddle/fluid/lite/operators/op_params.h
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/fluid/lite/core/compatible_tensor.h"
 #include "paddle/fluid/lite/core/framework.pb.h"
@@ -94,14 +95,67 @@ struct ScaleParam {
  bool bias_after_scale{true};
 };

-// For Softmax Op
+// For Softmax op
 struct SoftmaxParam {
  lite::Tensor* x{};
  lite::Tensor* output{};
-
  int axis{-1};
 };

+// For Convolution op
+struct ConvParam {
+  lite::Tensor* x{};
+  lite::Tensor* filter{};
+  lite::Tensor* bias{};
+  lite::Tensor* residualData{};
+  lite::Tensor* output{};
+  std::vector<int> strides{1, 1};
+  std::vector<int> paddings{0, 0};
+  int groups{1};
+  std::vector<int> dilations{1, 1};
+  bool fuse_relu_before_depthwise_conv{false};
+  bool use_mkldnn{false};
+  bool fuse_relu{false};  // only used in mkldnn kernel
+  bool use_quantizer{
+      false};  // set true for op that should be quantized, only used for cpu
+  bool fuse_residual_connection{false};
+  float scale_in{1.0f};           // only used with mkl-dnn int8
+  float scale_out{1.0f};          // only used with mkl-dnn int8
+  float scale_in_eltwise{1.0f};   // only used with mkl-dnn int8
+  float scale_weights{1.0f};      // only used with mkl-dnn int8
+  bool force_fp32_output{false};  // only used in mkl-dnn int8
+  std::string data_format{"Anylayout"};
+};
+
+// For Pooling op
+struct PoolParam {
+  lite::Tensor* x{};
+  lite::Tensor* output{};
+  std::string pooling_type{""};
+  std::vector<int> ksize{};
+  bool global_pooling{
+      false};  // if true, knernel size and paddings will be ignored
+  std::vector<int> strides{1, 1};
+  std::vector<int> paddings{0, 0};
+  bool exclusive{true};
+  bool adaptive{false};
+  bool ceil_mode{false};
+  bool use_quantizer{false};
+  std::string data_format{"AnyLayout"};
+};
+
+// For Dropout op
+struct DropoutParam {
+  const lite::Tensor* x{};
+  lite::Tensor* output{};
+  lite::Tensor* mask{};
+  float dropout_prob{.5f};
+  bool is_test{false};
+  bool fix_seed{false};
+  int seed{0};
+  std::string dropout_implementation{"downgrade_in_infer"};
+};
+
 /// ----------------------- element wise operators ----------------------
 struct ElementwiseParam {
  const lite::Tensor* X{};

--- a/paddle/fluid/lite/operators/relu_op.cc
+++ b/paddle/fluid/lite/operators/relu_op.cc
@@ -25,7 +25,6 @@ bool ReluOp::InferShape() const {
  CHECK_OR_FALSE(param_.output);
  // TODO(Superjomn) Enable data sharing.
  param_.output->Resize(param_.input->dims());
-  // param_.output->ShareDataWith(*param_.input);
  // share lod
  // param_.output->set_lod(param_.input->lod());
  return true;
@@ -42,8 +41,8 @@ bool ReluOp::AttachImpl(const OpDesc &opdesc, lite::Scope *scope) {
  return true;
 }

-REGISTER_LITE_OP(relu, ReluOp);
-
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
+
+REGISTER_LITE_OP(relu, paddle::lite::operators::ReluOp);
--- a/paddle/fluid/lite/operators/relu_op.h
+++ b/paddle/fluid/lite/operators/relu_op.h
@@ -35,7 +35,7 @@ class ReluOp : public OpLite {
  bool AttachImpl(const OpDesc &opdesc, lite::Scope *scope) override;

  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
-  std::string DebugString() const override { return "tanh"; }
+  std::string DebugString() const override { return "relu"; }

 private:
  mutable ReluParam param_;

--- a/paddle/fluid/lite/utils/varient.h
+++ b/paddle/fluid/lite/utils/varient.h
@@ -128,8 +128,9 @@ struct variant {
    if (type_id == typeid(T).hash_code())
      return reinterpret_cast<T*>(&data);
    else
-      LOG(FATAL) << "unmatched type get, should be " << type_id << " but get "
+      LOG(ERROR) << "unmatched type get, should be " << type_id << " but get "
                 << typeid(T).name();
+    throw std::invalid_argument("unmatched type");
  }
  ~variant() { helper_t::destroy(type_id, &data); }
 };

--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -6,7 +6,8 @@ cc_library(memcpy SRCS memcpy.cc DEPS place)
 cc_library(memory
        DEPS
        malloc
-        memcpy)
+        memcpy
+        )
 #if (WITH_GPU)
 #   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place memory)
 #endif()