Commit 383137c8 authored by: J jhjiangcs

add some mpc ops and fix some bugs.

Parent 6c66b67b
...@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.15)
project(PaddleEncrypted)
add_compile_options(-msse4.2 -fPIC -DPADDLE_WITH_MKLDNN -O2)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(CMAKE_CXX_STANDARD 11)
...@@ -57,6 +57,10 @@ option(WITH_TESTING "Compile with unit testing" ON)
option(WITH_PSI "Compile with psi lib" ON)
option(USE_AES_NI "Compile with AES NI" ON)
option(USE_OPENMP "Compile with OpenMP" ON)
########################### the project build part ###############################
message(STATUS "Using paddlepaddle installation of ${paddle_version}")
message(STATUS "paddlepaddle include directory: ${PADDLE_INCLUDE}")
...@@ -70,6 +74,15 @@ include_directories(.)
include_directories(${PADDLE_INCLUDE})
include_directories(${PADDLE_INCLUDE}/third_party)

if (USE_AES_NI)
    add_compile_definitions(USE_AES_NI)
    add_compile_options(-maes)
endif (USE_AES_NI)

if (USE_OPENMP)
    add_compile_options(-fopenmp)
    find_package(OpenMP REQUIRED)
endif(USE_OPENMP)

add_subdirectory(core/privc3)
add_subdirectory(core/paddlefl_mpc/mpc_protocol)
......
add_compile_options(-msse4.2 -maes)
set(PYBIND_SRCS
    "./data_utils.cc"
)
if (NOT PYTHON_INCLUDE_DIRS)
    find_package(PythonLibs REQUIRED)
endif()
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <atomic>
#include <set>
#include <string>
...@@ -21,8 +24,8 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "core/paddlefl_mpc/mpc_protocol/aby3_operators.h"
#include "core/privc3/fixedpoint_util.h"
#include "core/psi/psi_api.h"

namespace py = pybind11;
...@@ -30,68 +33,87 @@ namespace py = pybind11;
namespace aby3 {

// split plaintext into three shares.
template<typename T, size_t N>
py::array_t<T> share(double input) {
    size_t share_num = 3;
    auto shares = py::array_t<T>(share_num);
    py::buffer_info shares_buf = shares.request();
    T* shares_buf_ptr = (T*)shares_buf.ptr;
    T* ret_ptr[share_num];
    for (size_t i = 0; i < share_num; ++i) {
        ret_ptr[i] = &shares_buf_ptr[i];
    }

    FixedPointUtil<T, N>::share(input, ret_ptr);

    return shares;
}

// combine three shares to reveal plaintext.
template<typename T, size_t N>
double reveal(py::array_t<T> shares) {
    size_t share_num = 3;
    py::buffer_info shares_buf = shares.request();
    T* shares_buf_ptr = (T*)shares_buf.ptr;
    T* ret[share_num];

    for (size_t idx = 0; idx < share_num; ++idx) {
        ret[idx] = &shares_buf_ptr[idx];
    }

    double result = FixedPointUtil<T, N>::reveal(ret);

    return result;
}

// call psi_send
int send_psi(int port, const std::set<std::string>& input) {
    std::atomic<int> prog(0);
    return psi::psi_send(port, input, &prog);
}

// call psi_recv
std::vector<std::string> recv_psi(const std::string& remote_ip,
                                  int port,
                                  const std::set<std::string>& input) {
    std::vector<std::string> output;
    std::atomic<int> prog(0);
    int ret = psi::psi_recv(remote_ip, port, input, &output, &prog);
    if (ret != 0) {
        output.clear();
        return output;
    }
    return output;
}
PYBIND11_MODULE(mpc_data_utils, m)
{
    // optional module docstring
    m.doc() = "pybind11 paddle-mpc plugin: data_utils (share, reveal, psi)";

    m.def("share", &share<long long, paddle::mpc::ABY3_SCALING_FACTOR>,
          "split plaintext into three shares.");
    m.def("reveal", &reveal<long long, paddle::mpc::ABY3_SCALING_FACTOR>,
          "combine three shares to reveal plaintext.");
    m.def("send_psi", &send_psi, "Send input in two party PSI.");
    m.def("recv_psi", &recv_psi,
          "Send input and return PSI result as output in two party PSI.");

    m.attr("mpc_one_share") = (1 << paddle::mpc::ABY3_SCALING_FACTOR) / 3;
}

} // namespace aby3
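For reference, the arithmetic behind share(), reveal() and the mpc_one_share attribute above is plain fixed-point secret sharing: a value is encoded as its fixed-point representation with scaling factor ABY3_SCALING_FACTOR (16) and split into three shares that sum back to that encoding. A minimal standalone sketch, assuming additive 3-out-of-3 sharing modulo 2^64; plain_share/plain_reveal are illustrative names, not the FixedPointUtil API, and the real library may draw randomness and lay out shares differently.

// Minimal sketch of the fixed-point sharing arithmetic (assumption: additive
// 3-of-3 sharing over 2^64; not the actual FixedPointUtil implementation).
#include <array>
#include <cstdint>
#include <iostream>
#include <random>

constexpr size_t kScalingFactor = 16;  // same value as ABY3_SCALING_FACTOR

std::array<uint64_t, 3> plain_share(double input) {
    // encode as fixed point: truncation of input * 2^16
    uint64_t fixed = static_cast<uint64_t>(static_cast<int64_t>(input * (1ULL << kScalingFactor)));
    std::mt19937_64 rng(std::random_device{}());
    std::array<uint64_t, 3> shares;
    shares[0] = rng();
    shares[1] = rng();
    shares[2] = fixed - shares[0] - shares[1];  // wraps modulo 2^64
    return shares;
}

double plain_reveal(const std::array<uint64_t, 3>& shares) {
    uint64_t fixed = shares[0] + shares[1] + shares[2];  // wraps modulo 2^64
    return static_cast<double>(static_cast<int64_t>(fixed)) / (1ULL << kScalingFactor);
}

int main() {
    auto s = plain_share(3.25);
    std::cout << plain_reveal(s) << std::endl;  // prints 3.25
    // mpc_one_share above equals (1 << 16) / 3, i.e. roughly one third of the
    // fixed-point encoding of 1.0, so three such shares reveal approximately 1.0.
    return 0;
}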
add_compile_options(-msse4.2 -maes)
set(PROTO_SRCS
    "./aby3_protocol.cc"
    "./mesh_network.cc"
...@@ -17,3 +15,5 @@ target_link_libraries(mpc_protocol fluid_framework gloo hiredis privc3)
cc_test(mesh_network_test SRCS mesh_network_test.cc DEPS mpc_protocol)
cc_test(mpc_protocol_test SRCS mpc_protocol_test.cc DEPS mpc_protocol)
cc_test(mpc_instance_test SRCS mpc_instance_test.cc DEPS mpc_protocol)
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

// Description: implementations of each virtual op according to ABY3 protocol

...@@ -21,9 +21,9 @@
#include "context_holder.h"
#include "mpc_operators.h"
#include "paddle/fluid/framework/tensor.h"
#include "core/privc3/boolean_tensor.h"
#include "core/privc3/circuit_context.h"
#include "core/privc3/fixedpoint_tensor.h"
#include "core/privc3/paddle_tensor.h"

namespace paddle {
...@@ -32,259 +32,344 @@ namespace mpc {
using paddle::framework::Tensor;
using aby3::CircuitContext;

// TODO: decide scaling factor
const size_t ABY3_SCALING_FACTOR = FIXED_POINTER_SCALING_FACTOR;

using FixedTensor = aby3::FixedPointTensor<int64_t, ABY3_SCALING_FACTOR>;
using BoolTensor = aby3::BooleanTensor<int64_t>;
using PaddleTensor = aby3::PaddleTensor<int64_t>;

class Aby3OperatorsImpl : public MpcOperators {
public:

    void add(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        auto lhs_tuple = from_tensor(lhs);
        auto rhs_tuple = from_tensor(rhs);
        auto out_tuple = from_tensor(out);

        auto lhs_ = std::get<0>(lhs_tuple).get();
        auto rhs_ = std::get<0>(rhs_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        lhs_->add(rhs_, out_);
    }

    // TODO: override
    void sub(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        auto lhs_tuple = from_tensor(lhs);
        auto rhs_tuple = from_tensor(rhs);
        auto out_tuple = from_tensor(out);

        auto lhs_ = std::get<0>(lhs_tuple).get();
        auto rhs_ = std::get<0>(rhs_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        lhs_->sub(rhs_, out_);
    }

    void neg(const Tensor *op, Tensor *out) override {
        auto op_tuple = from_tensor(op);
        auto out_tuple = from_tensor(out);

        auto op_ = std::get<0>(op_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        op_->negative(out_);
    }

    void sum(const Tensor *op, Tensor *out) override {
        auto op_tuple = from_tensor(op);
        auto out_tuple = from_tensor(out);

        auto op_ = std::get<0>(op_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        op_->sum(out_);
    }

    void mul(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        auto lhs_tuple = from_tensor(lhs);
        auto rhs_tuple = from_tensor(rhs);
        auto out_tuple = from_tensor(out);

        auto lhs_ = std::get<0>(lhs_tuple).get();
        auto rhs_ = std::get<0>(rhs_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        lhs_->mul(rhs_, out_);
    }

    void matmul(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        auto lhs_tuple = from_tensor(lhs);
        auto rhs_tuple = from_tensor(rhs);
        auto out_tuple = from_tensor(out);

        auto lhs_ = std::get<0>(lhs_tuple).get();
        auto rhs_ = std::get<0>(rhs_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        lhs_->mat_mul(rhs_, out_);
    }

    void scale(const Tensor *lhs, const double factor, Tensor *out) override {
        auto lhs_tuple = from_tensor(lhs);
        auto out_tuple = from_tensor(out);

        auto lhs_ = std::get<0>(lhs_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        PaddleTensor scale_tensor(ContextHolder::device_ctx());
        scale_tensor.from_float_point_scalar(factor, lhs_->shape(), ABY3_SCALING_FACTOR);

        lhs_->mul(&scale_tensor, out_);
    }

    void relu(const Tensor *op, Tensor *out) override {
        auto op_tuple = from_tensor(op);
        auto out_tuple = from_tensor(out);

        auto op_ = std::get<0>(op_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        op_->relu(out_);
    }

    void relu_with_derivative(const Tensor *op, Tensor *out, Tensor *derivative) override {
        auto op_tuple = from_tensor(op);
        auto out_tuple = from_tensor(out);
        auto der_tuple = from_tensor<BoolTensor>(derivative);

        auto op_ = std::get<0>(op_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();
        auto der_ = std::get<0>(der_tuple).get();

        op_->relu_with_derivative(out_, der_);
    }

    void sigmoid(const Tensor *op, Tensor *out) override {
        auto op_tuple = from_tensor(op);
        auto out_tuple = from_tensor(out);

        auto op_ = std::get<0>(op_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        op_->sigmoid(out_);
    }

    void sigmoid_enhanced(const Tensor *op, Tensor *out) override {
        auto op_tuple = from_tensor(op);
        auto out_tuple = from_tensor(out);

        auto op_ = std::get<0>(op_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        op_->sigmoid_enhanced(out_);
    }

    void sigmoid_chebyshev(const Tensor *op, Tensor *out) override {
        auto op_tuple = from_tensor(op);
        auto out_tuple = from_tensor(out);

        auto op_ = std::get<0>(op_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        op_->sigmoid_chebyshev(out_);
    }

    void softmax(const Tensor *op, Tensor *out, bool use_relu, bool use_long_div) override {
        auto op_tuple = from_tensor(op);
        auto out_tuple = from_tensor(out);

        auto op_ = std::get<0>(op_tuple).get();
        auto out_ = std::get<0>(out_tuple).get();

        op_->softmax(out_, use_relu, use_long_div);
    }

    void gt(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        auto lhs_tuple = from_tensor(lhs);
        auto lhs_ = std::get<0>(lhs_tuple).get();

        PaddleTensor rhs_(ContextHolder::device_ctx());
        rhs_.from_float_point_type<float>(*rhs, ABY3_SCALING_FACTOR);

        PaddleTensor out_(ContextHolder::device_ctx(), *out);

        auto tmp0 = ContextHolder::tensor_factory()->create_int64_t(rhs_.shape());
        auto tmp1 = ContextHolder::tensor_factory()->create_int64_t(rhs_.shape());

        BoolTensor bool_out(tmp0.get(), tmp1.get());

        lhs_->gt(&rhs_, &bool_out);

        bool_out.reveal(&out_);
    }

    void geq(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        lt(lhs, rhs, out);
        std::transform(out->data<int64_t>(), out->data<int64_t>() + out->numel(),
                       out->data<int64_t>(), [](int64_t b) { return 1 - b; });
    }

    void lt(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        auto lhs_tuple = from_tensor(lhs);
        auto lhs_ = std::get<0>(lhs_tuple).get();

        PaddleTensor rhs_(ContextHolder::device_ctx(), *rhs);
        rhs_.from_float_point_type<float>(*rhs, ABY3_SCALING_FACTOR);

        PaddleTensor out_(ContextHolder::device_ctx(), *out);

        auto tmp0 = ContextHolder::tensor_factory()->create_int64_t(rhs_.shape());
        auto tmp1 = ContextHolder::tensor_factory()->create_int64_t(rhs_.shape());

        BoolTensor bool_out(tmp0.get(), tmp1.get());

        lhs_->lt(&rhs_, &bool_out);

        bool_out.reveal(&out_);
    }

    void leq(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        gt(lhs, rhs, out);
        std::transform(out->data<int64_t>(), out->data<int64_t>() + out->numel(),
                       out->data<int64_t>(), [](int64_t b) { return 1 - b; });
    }

    void eq(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        auto lhs_tuple = from_tensor(lhs);
        auto lhs_ = std::get<0>(lhs_tuple).get();

        PaddleTensor rhs_(ContextHolder::device_ctx(), *rhs);
        rhs_.from_float_point_type<float>(*rhs, ABY3_SCALING_FACTOR);

        PaddleTensor out_(ContextHolder::device_ctx(), *out);

        auto tmp0 = ContextHolder::tensor_factory()->create_int64_t(rhs_.shape());
        auto tmp1 = ContextHolder::tensor_factory()->create_int64_t(rhs_.shape());

        BoolTensor bool_out(tmp0.get(), tmp1.get());

        lhs_->eq(&rhs_, &bool_out);

        bool_out.reveal(&out_);
    }

    void neq(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
        eq(lhs, rhs, out);
        std::transform(out->data<int64_t>(), out->data<int64_t>() + out->numel(),
                       out->data<int64_t>(), [](int64_t b) { return 1 - b; });
    }

    void relu_grad(const Tensor *y, const Tensor *dy,
                   Tensor *dx, float point = 0.0f) override {
        auto y_tuple = from_tensor(y);
        auto y_ = std::get<0>(y_tuple).get();

        PaddleTensor point_(ContextHolder::device_ctx());
        point_.from_float_point_scalar<float>(point, y_->shape(), ABY3_SCALING_FACTOR);

        auto tmp0 = ContextHolder::tensor_factory()->create_int64_t(y_->shape());
        auto tmp1 = ContextHolder::tensor_factory()->create_int64_t(y_->shape());

        BoolTensor bool_out(tmp0.get(), tmp1.get());

        y_->gt(&point_, &bool_out);

        auto out_tuple = from_tensor(dx);
        auto out_ = std::get<0>(out_tuple).get();

        auto dy_tuple = from_tensor(dy);
        auto dy_ = std::get<0>(dy_tuple).get();

        bool_out.mul(dy_, out_);
    }

    void arith_bool_mul(const Tensor* op_a, const Tensor* op_b, Tensor* out) override {
        auto a_tuple = from_tensor(op_a);
        auto a_ = std::get<0>(a_tuple).get();

        auto b_tuple = from_tensor<BoolTensor>(op_b);
        auto b_ = std::get<0>(b_tuple).get();

        auto out_tuple = from_tensor(out);
        auto out_ = std::get<0>(out_tuple).get();

        b_->mul(a_, out_);
    }

    void max_pooling(const Tensor* in, Tensor* out, Tensor* pos_info) override {
        auto a_tuple = from_tensor(in);
        auto a_ = std::get<0>(a_tuple).get();

        auto b_tuple = from_tensor<BoolTensor>(pos_info);
        auto b_ = std::get<0>(b_tuple).get();

        auto out_tuple = from_tensor(out);
        auto out_ = std::get<0>(out_tuple).get();

        a_->max_pooling(out_, b_);
    }

    void inverse_square_root(const Tensor* in, Tensor* out) override {
        auto x_tuple = from_tensor(in);
        auto x_ = std::get<0>(x_tuple).get();

        auto y_tuple = from_tensor(out);
        auto y_ = std::get<0>(y_tuple).get();

        x_->inverse_square_root(y_);
    }

private:

    template <typename T>
    std::tuple<
        std::shared_ptr<T>,
        std::shared_ptr<PaddleTensor>,
        std::shared_ptr<PaddleTensor> > from_tensor(const Tensor* t) {

        PADDLE_ENFORCE_EQ(t->dims()[0], 2);

        auto pt0 = std::make_shared<PaddleTensor>(ContextHolder::device_ctx(), t->Slice(0, 1));
        auto pt1 = std::make_shared<PaddleTensor>(ContextHolder::device_ctx(), t->Slice(1, 2));

        // remove leading 1 in shape
        auto shape = pt0->shape();
        shape.erase(shape.begin());
        pt0->reshape(shape);
        pt1->reshape(shape);

        aby3::TensorAdapter<int64_t>* pt_array[2] = {pt0.get(), pt1.get()};

        auto ft = std::make_shared<T>(pt_array);

        return std::make_tuple(ft, pt0, pt1);
    }

    std::tuple<
        std::shared_ptr<FixedTensor>,
        std::shared_ptr<PaddleTensor>,
        std::shared_ptr<PaddleTensor> > from_tensor(const Tensor* t) {
        return from_tensor<FixedTensor>(t);
    }

};

} // mpc
......
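A note on the tensor layout from_tensor() above relies on: each mpc Tensor stores a party's two shares along a leading dimension of size 2, which is sliced off (Slice(0, 1) / Slice(1, 2)) and the leftover leading 1 erased before the two slices are wrapped into a FixedPointTensor or BooleanTensor. A plaintext sketch of that shape bookkeeping, with std::vector standing in for the Paddle dims (illustrative only, no Paddle dependency):

// Shape bookkeeping sketch for the share layout expected by from_tensor().
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    std::vector<int64_t> dims = {2, 3, 4};  // [share_dim, rows, cols]
    assert(dims[0] == 2);                   // mirrors PADDLE_ENFORCE_EQ(t->dims()[0], 2)

    // Slice(0, 1) and Slice(1, 2) each keep a leading dimension of 1 ...
    std::vector<int64_t> share0 = {1, 3, 4};
    std::vector<int64_t> share1 = {1, 3, 4};

    // ... which from_tensor() erases before wrapping the two slices.
    share0.erase(share0.begin());
    share1.erase(share1.begin());
    assert((share0 == std::vector<int64_t>{3, 4}));
    assert((share1 == std::vector<int64_t>{3, 4}));
    return 0;
}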
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

// Description:
// abstract mpc operation interface

...@@ -24,43 +24,67 @@ namespace mpc {
using paddle::framework::Tensor;

// TODO: decide scaling factor
const size_t FIXED_POINTER_SCALING_FACTOR = 16;

class MpcOperators {
public:
    virtual void add(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void sub(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void neg(const Tensor *op, Tensor *out) = 0;

    virtual void sum(const Tensor *op, Tensor *out) = 0;

    virtual void mul(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void matmul(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void scale(const Tensor *lhs, const double factor, Tensor *out) = 0;

    virtual void relu(const Tensor *op, Tensor *out) = 0;

    virtual void relu_with_derivative(const Tensor *op, Tensor *out, Tensor *derivative) = 0;

    virtual void sigmoid(const Tensor *op, Tensor *out) = 0;

    virtual void sigmoid_enhanced(const Tensor *op, Tensor *out) = 0;

    virtual void sigmoid_chebyshev(const Tensor *op, Tensor *out) = 0;

    virtual void softmax(const Tensor *op, Tensor *out, bool use_relu, bool use_long_div) = 0;

    virtual void gt(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void geq(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void lt(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void leq(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void eq(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void neq(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;

    virtual void relu_grad(const Tensor *y, const Tensor *dy, Tensor *dx, const float point) = 0;

    // arithmetic tensor mult boolean tensor, element-wisely
    // see [ABY3, sec 5.4.1]
    // for aby3 only
    // example (in plaintext):
    // [1, 2, 3, 4] * [0, 0, 1, 0] = [0, 0, 3, 0]
    virtual void arith_bool_mul(const Tensor* op_a, const Tensor* op_b, Tensor* out) {}

    // max pooling in which shape of filter is nx1
    // pos_info keeps which element is max in a col, for backward grad
    // for filter in other shape, reshape input first
    virtual void max_pooling(const Tensor* in, Tensor* out, Tensor* pos_info) {}

    virtual void inverse_square_root(const Tensor* in, Tensor* out) = 0;
};

} // mpc
} // paddle
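The comment above arith_bool_mul() documents its plaintext meaning: multiply an arithmetic tensor element-wise by a 0/1 mask. A plaintext sketch of that semantics, with plain vectors standing in for the secret-shared tensors (the MPC version computes the same result on shares):

// Plaintext analogue of arith_bool_mul(): [1, 2, 3, 4] * [0, 0, 1, 0] = [0, 0, 3, 0].
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    std::vector<int64_t> a = {1, 2, 3, 4};
    std::vector<int64_t> mask = {0, 0, 1, 0};  // the boolean tensor in the interface above

    std::vector<int64_t> out(a.size());
    for (size_t i = 0; i < a.size(); ++i) {
        out[i] = a[i] * mask[i];
    }

    for (int64_t v : out) {
        std::cout << v << " ";  // prints: 0 0 3 0
    }
    std::cout << std::endl;
    return 0;
}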
add_compile_options(-msse4.2 -maes)
aux_source_directory(. DIR_SRCS)
aux_source_directory(./math MATH_SRCS)
add_library(mpc_ops_o OBJECT ${DIR_SRCS} ${MATH_SRCS})
add_dependencies(mpc_ops_o fluid_framework gloo)
add_library(mpc_ops STATIC $<TARGET_OBJECTS:mpc_ops_o>)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "./conv_op.h"
#include <memory>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
std::vector<int64_t> ConvOp::ComputeOutputShape(
framework::InferShapeContext* ctx) const {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv");
OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::string padding_algorithm =
ctx->Attrs().Get<std::string>("padding_algorithm");
int groups = ctx->Attrs().Get<int>("groups");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
const std::string data_format = ctx->Attrs().Get<std::string>("data_format");
// MKL-DNN Kernels are using NCHW order of dims description
// so we ignore data_format consideration for MKL-DNN kernel
const bool channel_last = (this->IsMKLDNNType() == false) &&
(data_format == "NHWC" || data_format == "NDHWC");
PADDLE_ENFORCE_EQ(
// 1 for share dim
in_dims.size() == 4 + 1 || in_dims.size() == 5 + 1, true,
platform::errors::InvalidArgument(
"The input of Op(Conv) should be a 4-D or 5-D Tensor. But "
"received: input's dimension is %u, input's shape is [%s].",
in_dims.size(), in_dims));
PADDLE_ENFORCE_EQ(
in_dims.size(), filter_dims.size(),
platform::errors::InvalidArgument(
"The input's dimension and filter's dimension of "
"Op(Conv) should be equal. But received: the input's shape is [%s], "
"the input's dimension is %d; the filter's shape is [%s], "
"the filter's dimension is %d.",
in_dims, in_dims.size(), filter_dims, filter_dims.size()));
int in_sub_stride_size = in_dims.size() - strides.size();
PADDLE_ENFORCE_EQ(
in_dims.size(), strides.size() + 2U + 1,
platform::errors::InvalidArgument(
"The difference of input's dimension and Attr(strides)'s "
"length must be equal to 2 for Op(Conv). "
"But received: input's dimension is %d, input's shape is [%s]; "
"Attr(stride)'s length is %d, Attr(stride) is [%s]; "
"difference of input's dimension and Attr(strides)'s length = %u.",
in_dims.size(), in_dims, strides.size(),
framework::make_ddim(strides), in_sub_stride_size));
const auto input_channels =
channel_last ? in_dims[in_dims.size() - 1] : in_dims[1 + 1];
PADDLE_ENFORCE_EQ(
input_channels, filter_dims[1 + 1] * groups,
platform::errors::InvalidArgument(
"The number of input's channels should be equal to filter's channels "
"* groups for Op(Conv). But received: the input's channels is %d, "
"the input's shape is [%s]; the filter's channels is %d, the "
"filter's shape is [%s]; the groups is %d, the data_format is %s. "
"The error may come from wrong data_format setting.",
input_channels, in_dims, filter_dims[1 + 1], filter_dims, groups,
data_format));
PADDLE_ENFORCE_EQ(
filter_dims[0 + 1] % groups, 0,
platform::errors::InvalidArgument(
"The number of output's channels (filter's first dimension) of "
"Op(Conv) should be divided by groups. But received: "
"the output channels is %d, the filter's shape is [%s], "
"the groups is %d.",
filter_dims[0 + 1], filter_dims, groups));
framework::DDim in_data_dims;
if (channel_last) {
in_data_dims = framework::slice_ddim(in_dims, 1 + 1, in_dims.size() - 1);
} else {
in_data_dims = framework::slice_ddim(in_dims, 2 + 1, in_dims.size());
}
framework::DDim filter_data_dims =
framework::slice_ddim(filter_dims, 2 + 1, filter_dims.size());
std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
std::vector<int64_t> output_shape({in_dims[0], in_dims[1]});
if (!channel_last) {
output_shape.push_back(filter_dims[0 + 1]);
}
for (int i = 0; i < in_data_dims.size(); ++i) {
if ((!ctx->IsRuntime()) &&
(in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
output_shape.push_back(-1);
} else {
output_shape.push_back(
ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i],
paddings[2 * i], paddings[2 * i + 1], strides[i]));
}
}
if (channel_last) {
output_shape.push_back(filter_dims[1]);
}
return output_shape;
}
framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input");
std::string data_format =
"AnyLayout"; // todo enable data layout when it's ready
framework::DataLayout layout = framework::StringToDataLayout(data_format);
if (input_data_type != framework::proto::VarType::INT8 &&
input_data_type != framework::proto::VarType::UINT8) {
auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
platform::errors::InvalidArgument(
"input and filter data type should be consistent"));
}
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
platform::errors::InvalidArgument(
"float16 can only be used when CUDNN is used"));
}
auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library, customized_type_value);
return type;
}
framework::OpKernelType ConvOp::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
void Conv2DOpMaker::Make() {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput("Input",
"(Tensor) The input tensor of convolution operator. "
"The format of input tensor is NCHW or NHWC, where N is batch size, "
"C is the "
"number of channels, H is the height of the feature, "
"and W is the width of the feature.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution operator. "
"The format of the filter tensor is MCHW, where M is the number of "
"output image channels, C is the number of input image channels, "
"H is the height of the filter, and W is the width of the filter. "
"If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups.");
AddInput("Bias",
"(Tensor) Bias to be added to each output of filter application."
"The format of output tensor is X (one-dimensional) of size equal"
"to the number of output channels. Only used with MKL-DNN.")
.AsDispensable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"It has the same data format and data type as the Input.");
AddAttr<std::vector<int>>("strides",
"(vector<int> default:{1, 1}), the "
"strides(h_stride, w_stride) of "
"convolution operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings",
"(vector<int> default:{0, 0}), the "
"paddings(pad_height_top, pad_height_bottom, "
"pad_width_left, pad_width_right) of "
"convolution operator.")
.SetDefault({0, 0});
AddAttr<std::string>(
"padding_algorithm",
"(string, default \"EXPLICIT\") An optional string from: \"EXPLICIT\","
"\"SAME\",\"VALID\". Set to \"EXPLICIT\" for explicit padding. "
"Set to \"SAME\" or \"VALID\" for algorithm of padding. ")
.SetDefault("EXPLICIT");
AddAttr<int>(
"groups",
"(int default:1), the groups number of the convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters "
"is only connected to the second half of the input channels.")
.SetDefault(1);
AddAttr<std::vector<int>>("dilations",
"(vector<int> default:{1, 1}), the "
"dilations(h_dilation, w_dilation) of "
"convolution operator.")
.SetDefault({1, 1});
AddAttr<bool>("use_quantizer",
"(bool, default false) "
"Set to true for operators that should be quantized and use "
"int8 kernel. "
"Only used on CPU.")
.SetDefault(false);
AddAttr<float>("Scale_in",
"Scale_in to be used for int8 input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_out",
"Scale_out to be used for int8 output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_in_eltwise",
"Scale_in_eltwise to be used for int8 eltwise input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NCHW\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("NCHW");
// TODO(dzhwinter): need to registered layout transform function
AddAttr<bool>("exhaustive_search",
"(bool, default false) cuDNN has many algorithms for calculating "
"convolution, whether to enable exhaustive search "
"for cuDNN convolution or not, default is False.")
.SetDefault(false);
AddComment(R"DOC(
Convolution Operator.
The convolution operation calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
Input(Input) and Output(Output) are in NCHW or NHWC format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature.
Filters(Input) is MCHW format. Where M is the number of output image channels, C is
the number of input image channels, H is the height of the filter, and W
is the width of the filter.
Parameters(strides, paddings, dilations) are two-element vectors. These two elements represent
height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output:
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
$$
H_{out}= \frac{(H_{in} + pad_height_top + pad_height_bottom - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
W_{out}= \frac{(W_{in} + pad_width_left + pad_width_right - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
$$
)DOC");
Apply();
}
void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
}
if (ctx->HasOutput(framework::GradVarName("Filter"))) {
ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
}
}
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
auto type = framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(),
layout_, library_, customized_type_value);
return type;
}
framework::OpKernelType ConvOpGrad::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
template <typename T>
class Conv2DGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> op) const override {
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("Input", this->Input("Input"));
op->SetInput("Filter", this->Input("Filter"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output"));
op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
op->SetOutput(framework::GradVarName("Filter"), this->InputGrad("Filter"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mpc_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
ops::ConvOpInferVarType,
ops::Conv2DGradMaker<paddle::framework::OpDesc>,
ops::Conv2DGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mpc_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(
mpc_conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
mpc_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
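The output-shape arithmetic in ComputeOutputShape() above follows the formula in the operator's DOC string, applied per spatial dimension, with one extra leading share dimension of size 2 in front of the usual NCHW layout. A worked sketch; conv_out_size() is an illustrative stand-in for the ConvOutputSize() helper the operator calls:

// Output-size sketch: out = (in + pad_0 + pad_1 - (dilation * (k - 1) + 1)) / stride + 1.
#include <iostream>

int conv_out_size(int in_size, int k, int dilation, int pad_0, int pad_1, int stride) {
    int dkernel = dilation * (k - 1) + 1;
    return (in_size + pad_0 + pad_1 - dkernel) / stride + 1;
}

int main() {
    // Shapes follow the convention above: input [2, N, C_in, H, W] = [2, 1, 3, 32, 32],
    // filter [2, C_out, C_in, 3, 3] = [2, 8, 3, 3, 3], stride 1, padding 1, dilation 1.
    int h_out = conv_out_size(32, 3, /*dilation=*/1, /*pad_top=*/1, /*pad_bottom=*/1, /*stride=*/1);
    int w_out = conv_out_size(32, 3, /*dilation=*/1, /*pad_left=*/1, /*pad_right=*/1, /*stride=*/1);
    std::cout << "output dims: [2, 1, 8, " << h_out << ", " << w_out << "]" << std::endl;  // 32, 32
    return 0;
}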
This diff is collapsed.
/* Copyright (c) 2020 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "core/paddlefl_mpc/operators/math/concat_and_split.h"
#include <vector>
namespace paddle {
namespace operators {
namespace math {
/*
* All tensors' dimension should be the same and the values of
* each dimension must be the same, except the axis dimension.
*/
template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const std::vector<framework::Tensor>& input, int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(input.size());
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
auto output_data = output->data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
auto input_data = input[j].data<T>();
for (int k = 0; k < out_rows; ++k) {
memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place,
input_data + k * col_len, sizeof(T) * col_len);
}
col_idx += col_len;
}
}
};
/*
* All tensors' dimension should be the same and the values of
* each dimension must be the same, except the axis dimension.
*/
template <typename T>
class SplitFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs) {
// TODO(zcd): Add input data validity checking
size_t num = outputs->size();
int input_rows = 1;
auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size());
for (size_t i = 0; i < num; ++i) {
int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
sizeof(T) * col_len);
}
col_idx += col_len;
}
}
}
};
#define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<platform::CPUDeviceContext, type>; \
template class SplitFunctor<platform::CPUDeviceContext, type>;
FOR_ALL_TYPES(DEFINE_FUNCTOR);
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace operators {
namespace math {
/*
* \brief Concatenate the input tensors along the dimension axis.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input[0] = [[1,2],[3,4]]
* Input[1] = [[5,6]]
* axis = 0
*
* Output = [[1,2],
* [3,4],
* [5,6]]
*/
template <typename DeviceContext, typename T>
class ConcatFunctor {
public:
void operator()(const DeviceContext& context,
const std::vector<framework::Tensor>& input, int axis,
framework::Tensor* output);
};
/*
* \brief Split the input tensors along the dimension axis into outputs.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input = [[1,2],
* [3,4],
* [5,6]]
* axis = 0
*
* Output[0] = [[1,2],[3,4]]
* Output[1] = [[5,6]]
*/
template <typename DeviceContext, typename T>
class SplitFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<const framework::Tensor*>& ref_inputs,
int axis, std::vector<framework::Tensor*>* outputs);
};
} // namespace math
} // namespace operators
} // namespace paddle
#define FOR_ALL_TYPES(macro) \
macro(int64_t);
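The comments in the header above spell out the intended semantics; a plaintext sketch of the documented concat example (inputs [[1,2],[3,4]] and [[5,6]], axis 0), with nested std::vectors standing in for framework::Tensor:

// Concat-along-axis-0 sketch matching the documented example.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    std::vector<std::vector<int64_t>> input0 = {{1, 2}, {3, 4}};
    std::vector<std::vector<int64_t>> input1 = {{5, 6}};

    // axis = 0: each input's rows are appended back to back, which is what
    // ConcatFunctor's row/column copy loop achieves on flat Tensor storage.
    std::vector<std::vector<int64_t>> output;
    for (const auto& row : input0) output.push_back(row);
    for (const auto& row : input1) output.push_back(row);

    for (const auto& row : output) {
        for (int64_t v : row) std::cout << v << " ";
        std::cout << std::endl;  // prints rows: 1 2 / 3 4 / 5 6
    }
    return 0;
}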
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "./im2col.h"
#include <vector>
#include "./im2col_cfo_cpu.h"
namespace paddle {
namespace operators {
namespace math {
/*
* im = [input_channels, input_height, input_width]
* col =
* [input_channels, filter_height, filter_width, output_height, output_width]
*/
template <class T>
class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& im, const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* col,
const DataLayout data_layout) {
PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3.");
PADDLE_ENFORCE_EQ(col->dims().size(), 5,
"The dimension of col should be 5.");
if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
dilation[1] == 1) {
if (padding[0] == 0 && padding[1] == 0 && padding[2] == 0 &&
padding[3] == 0) {
im2col_sh1sw1dh1dw1ph0pw0<T>(im, col, data_layout);
return;
} else if (padding[0] == 1 && padding[1] == 1 && padding[2] == 1 &&
padding[3] == 1) {
im2col_sh1sw1dh1dw1ph1pw1<T>(im, col, data_layout);
return;
}
// TODO(TJ): complete padding >=2
}
im2col_common<T>(im, dilation, stride, padding, col, data_layout);
}
};
/*
* im = [input_channels, input_height, input_width]
* col =
* [input_channels, filter_height, filter_width, output_height, output_width]
*/
template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& col,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im,
const DataLayout data_layout) {
PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3.");
PADDLE_ENFORCE_EQ(col.dims().size(), 5,
"The dimension of col should be 5.");
int im_channels =
(data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]);
int im_height =
(data_layout != DataLayout::kNHWC ? im->dims()[1] : im->dims()[0]);
int im_width =
(data_layout != DataLayout::kNHWC ? im->dims()[2] : im->dims()[1]);
int filter_height = col.dims()[1];
int filter_width = col.dims()[2];
int col_height = col.dims()[3];
int col_width = col.dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
int channels_col = im_channels * filter_height * filter_width;
T* im_data = im->data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < col_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < col_width; ++w) {
int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
(im_col_idx) >= 0 && (im_col_idx) < im_width) {
int im_offset;
if (data_layout != DataLayout::kNHWC) {
im_offset =
(c_im * im_height + im_row_idx) * im_width + im_col_idx;
} else {
im_offset =
(im_row_idx * im_width + im_col_idx) * im_channels + c_im;
}
im_data[im_offset] +=
col_data[(c * col_height + h) * col_width + w];
}
}
}
}
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext, int64_t>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext, int64_t>;
/*
* im = [input_channels, input_height, input_width]
* col =
* [output_height, output_width, input_channels, filter_height, filter_width]
*/
template <class T>
class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& im, const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* col,
const DataLayout data_layout) {
PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3.");
PADDLE_ENFORCE_EQ(col->dims().size(), 5,
"The dimension of col should be 5.");
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
int filter_height = col->dims()[3];
int filter_width = col->dims()[4];
int col_height = col->dims()[0];
int col_width = col->dims()[1];
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
for (int channel = 0; channel < im_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) {
int im_row_offset =
col_row_idx * stride[0] + filter_row_idx - padding[0];
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_col_offset =
col_col_idx * stride[1] + filter_col_idx - padding[1];
int col_offset =
((((col_row_idx)*col_width + col_col_idx) * im_channels +
channel) *
filter_height +
filter_row_idx) *
filter_width +
filter_col_idx;
int im_offset = (channel * im_height + im_row_offset) * im_width +
im_col_offset;
col_data[col_offset] =
(im_row_offset < 0 || im_row_offset >= im_height ||
im_col_offset < 0 || im_col_offset >= im_width)
? static_cast<T>(0)
: im_data[im_offset];
}
}
}
}
}
}
};
/*
* im = [input_channels, input_height, input_width]
* col =
* [output_height, output_width, input_channels, filter_height, filter_width]
*/
template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& col,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im,
const DataLayout data_layout) {
PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3.");
PADDLE_ENFORCE_EQ(col.dims().size(), 5,
"The dimension of col should be 5.");
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
int col_height = col.dims()[0];
int col_width = col.dims()[1];
PADDLE_ENFORCE_EQ(
(im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
T* im_data = im->data<T>();
const T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
for (int channel = 0; channel < im_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) {
int im_row_offset =
col_row_idx * stride[0] + filter_row_idx - padding[0];
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_col_offset =
col_col_idx * stride[1] + filter_col_idx - padding[1];
int col_offset =
(((col_row_idx * col_width + col_col_idx) * im_channels +
channel) *
filter_height +
filter_row_idx) *
filter_width +
filter_col_idx;
if (im_row_offset >= 0 && im_row_offset < im_height &&
im_col_offset >= 0 && im_col_offset < im_width) {
int im_offset =
(channel * im_height + im_row_offset) * im_width +
im_col_offset;
im_data[im_offset] += col_data[col_offset];
}
}
}
}
}
}
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext, int64_t>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext, int64_t>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
enum class ColFormat { kCFO = 0, kOCF = 1 };
/*
 * \brief Converts three-dimensional image data (CHW) into five-dimensional
 * colData in the Im2ColFunctor calculation;
 * the Col2ImFunctor calculation reverses the transformation.
*
* \param imData Image data.
* \param imShape The shape of imData,
* [input_channels, input_height, input_width].
* \param colData Column data.
* \param colShape The shape of colData.
*
* \param dilations dilation data.
* \param 2-dimension [dilation_height, dilation_width].
*
* \param strides stride data.
* \param 2-dimension [stride_height, stride_width].
*
* \param paddings padding data.
* \param 4-dimension [up_pad, left_pad, down_pad, right_pad].
*
* If the template argument Format is kCFO, the shape of colData is:
* [input_channels, filter_height, filter_width, output_height, output_width]
* So, it is easy to reshape into a convolution matrix for convolution
* calculation based on matrix multiplication.
 * The shape of the convolution matrix is [height, width], where the height
 * equals input_channels * filter_height * filter_width and the width equals
 * output_height * output_width.
*
* Reshape:
* shape of colData shape of convolution matrix
* [input_channels,
* filter_height,
* filter_width, ======> [height, width]
* output_height,
* output_width]
*
* If the template argument Format is kOCF, the shape of colData is:
* [output_height, output_width, input_channels, filter_height, filter_width]
* So, it is easy to reshape into a sequence matrix for rnn calculation.
 * The shape of the sequence matrix is [seq_length, step_size], where
 * seq_length equals output_height * output_width and step_size equals
 * input_channels * filter_height * filter_width.
*
* Reshape:
* shape of colData shape of sequence matrix
* [output_height,
* output_width,
* input_channels, ======> [seqLength, stepSize]
* filter_height,
* filter_width]
*
* \note The caller needs to ensure that imShape.inputChannels is equal to
* colShape.inputChannels.
*/
template <ColFormat Format, typename DeviceContext, typename T>
class Im2ColFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& im,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* col,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <ColFormat Format, typename DeviceContext, typename T>
class Col2ImFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& col,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im,
const DataLayout data_layout = DataLayout::kNCHW);
};
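// A minimal usage sketch, assuming a CHW image tensor and a pre-allocated
// col tensor in the kCFO layout described above; the function name and the
// concrete shapes are illustrative only and are not part of the library API.
// The int64_t instantiation is the one provided by im2col.cc.
inline void im2col_kcfo_usage_sketch(const platform::CPUDeviceContext& ctx,
                                     const framework::Tensor& im,
                                     framework::Tensor* col) {
  // Unit stride and dilation, no padding: [up, left, down, right].
  std::vector<int> dilation{1, 1};
  std::vector<int> stride{1, 1};
  std::vector<int> padding{0, 0, 0, 0};
  Im2ColFunctor<ColFormat::kCFO, platform::CPUDeviceContext, int64_t> im2col;
  im2col(ctx, im, dilation, stride, padding, col, DataLayout::kNCHW);
  // Col2ImFunctor reverses the transform with the same arguments; values from
  // overlapping windows are accumulated back into the image tensor.
}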
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
/**
* The most common im2col algorithm.
 * It supports dilation, stride and padding.
*/
template <typename T>
inline void im2col_common(const framework::Tensor& im,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding,
framework::Tensor* col,
const DataLayout data_layout = DataLayout::kNCHW) {
int im_channels =
(data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
int im_height =
(data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
int im_width =
(data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < output_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < output_width; ++w) {
int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
int im_idx;
if (data_layout != DataLayout::kNHWC) {
im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
} else {
im_idx = (im_row_idx * im_width + im_col_idx) * im_channels + c_im;
}
int col_idx = (c * output_height + h) * output_width + w;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0)
: im_data[im_idx];
}
}
}
}
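// A small helper sketch for the shape relationship that im2col_common relies
// on: col->dims()[3] and col->dims()[4] must equal the value computed below
// for the height and width dimensions respectively. The helper is
// illustrative only and is not called by the kernels in this file.
inline int im2col_expected_output_size(int im_size, int filter_size,
                                       int dilation, int pad_before,
                                       int pad_after, int stride) {
  // Effective filter extent once dilation is applied.
  int dkernel = dilation * (filter_size - 1) + 1;
  return (im_size + pad_before + pad_after - dkernel) / stride + 1;
}
// Example: a 5x5 image with a 3x3 filter, stride 1, dilation 1 and no padding
// gives (5 + 0 + 0 - 3) / 1 + 1 = 3 outputs along each spatial dimension.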
/**
* im2col algorithm with strides == 1, dilations == 1, paddings == 0
*/
template <typename T>
inline void im2col_sh1sw1dh1dw1ph0pw0(
const framework::Tensor& im, framework::Tensor* col,
const DataLayout data_layout = DataLayout::kNCHW) {
int im_channels =
(data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
int im_height =
(data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
int im_width =
(data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
int col_matrix_width = output_width * output_height;
int im_size = im_height * im_width;
size_t copy_size = sizeof(T) * output_width;
const T* im_data_oh = im_data;
T* dst_data_oh = col_data;
for (int oh = 0; oh < output_height; ++oh) {
const T* src_data_ic = im_data_oh;
T* dst_data = dst_data_oh;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = src_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
for (int kw = 0; kw < filter_width; ++kw) {
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data, src_data + kw, copy_size);
} else {
for (int kow = 0; kow < output_width; ++kow) {
dst_data[kow] =
im_data[((oh + kh) * im_width + kw + kow) * im_channels + ic];
}
}
dst_data = dst_data + col_matrix_width;
}
src_data = src_data + im_width;
}
src_data_ic = src_data_ic + im_size;
}
im_data_oh = im_data_oh + im_width;
dst_data_oh = dst_data_oh + output_width;
}
}
/**
 * im2col algorithm with strides == 1, dilations == 1, paddings == 1;
 * the filter_width == 1 case has a dedicated fast path below
*/
template <typename T>
inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
framework::Tensor* col,
const DataLayout data_layout) {
int im_channels =
(data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
int im_height =
(data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
int im_width =
(data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
constexpr int plh = 1;
constexpr int prh = 1;
constexpr int plw = 1;
constexpr int prw = 1;
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
int im_size = im_height * im_width;
int col_matrix_width = output_width * output_height;
int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow
int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow
// fill height padding
{
size_t copy_size = sizeof(T) * output_width;
T* col_start_l = col_data;
T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
col_matrix_width - output_width;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_l = col_start_l;
T* dst_data_r = col_start_r;
for (int kw = 0; kw < filter_width; ++kw) {
std::memset(dst_data_l, 0, copy_size);
std::memset(dst_data_r, 0, copy_size);
dst_data_l = dst_data_l + col_matrix_width;
dst_data_r = dst_data_r + col_matrix_width;
}
col_start_l = col_start_l + col_block_ic;
col_start_r = col_start_r + col_block_ic;
}
}
auto pad = static_cast<T>(0);
if (filter_width == 1) {
// fill width padding
T* dst_data_ic = col_data;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_kh = dst_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
T* dst_data = dst_data_kh;
for (int oh = 0; oh < output_height; ++oh) {
*dst_data = pad;
dst_data = dst_data + output_width - 1;
*dst_data = pad;
++dst_data;
}
dst_data_kh = dst_data_kh + col_block_fh;
}
dst_data_ic = dst_data_ic + col_block_ic;
}
// fill core
size_t copy_size = sizeof(T) * (output_width - plw - prw);
for (int oh = 0; oh < output_height; ++oh) {
const T* im_data_start =
im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
T* dst_data = col_data + oh * output_width;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = im_data_start + ic * im_size;
for (int kh = 0; kh < filter_height; ++kh) {
if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
kh > (filter_height - prh - 1))) {
dst_data = dst_data + col_matrix_width;
continue;
}
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data + plw, src_data, copy_size);
} else {
for (int kow = 0; kow < output_width - plw - prw; ++kow) {
dst_data[plw + kow] =
im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
kow) *
im_channels +
ic];
}
}
dst_data = dst_data + col_matrix_width;
src_data = src_data + im_width;
}
}
}
return;
}
// filter_width != 1
// fill width padding
T* dst_data_ic = col_data;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_kh = dst_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
for (T* dst_data :
{dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
output_width - 1}) {
// TODO(TJ): from plh, saving repeated assignment
for (int oh = 0; oh < output_height; ++oh) {
*dst_data = pad;
dst_data = dst_data + output_width;
}
}
dst_data_kh = dst_data_kh + col_block_fh;
}
dst_data_ic = dst_data_ic + col_block_ic;
}
// TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
// (output_width-1)}
  // The length of copy_size equals kw.
for (int oh = 0; oh < output_height; ++oh) {
const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
T* dst_data = col_data + oh * output_width;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = im_data_start + ic * im_size;
for (int kh = 0; kh < filter_height; ++kh) {
if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
kh > (filter_height - prh - 1))) {
dst_data = dst_data + filter_width * col_matrix_width;
continue;
}
// TODO(TJ): reuse plw-kw outside this for
// try to unify
for (int kw = 0; kw < plw; ++kw) {
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data + (plw - kw), src_data,
sizeof(T) * (output_width - (plw - kw)));
} else {
for (int kow = 0; kow < output_width - (plw - kw); ++kow) {
dst_data[plw - kw + kow] =
im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
kow) *
im_channels +
ic];
}
}
dst_data = dst_data + col_matrix_width;
}
for (int kw = plw; kw < filter_width - prw; ++kw) {
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data, src_data + (kw - plw),
sizeof(T) * output_width);
} else {
for (int kow = 0; kow < output_width; ++kow) {
dst_data[kow] =
im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
kw - plw + kow) *
im_channels +
ic];
}
}
dst_data = dst_data + col_matrix_width;
}
int i = 1;
for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data, src_data + (kw - plw),
sizeof(T) * (output_width - i));
} else {
for (int kow = 0; kow < output_width - i; ++kow) {
dst_data[kow] =
im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
kw - plw + kow) *
im_channels +
ic];
}
}
dst_data = dst_data + col_matrix_width;
}
src_data = src_data + im_width;
}
}
}
}
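// A dispatch sketch, assuming the caller follows the common convention of
// selecting a specialized kernel only when stride and dilation are 1 and the
// padding is uniformly 0 or 1; the actual dispatch lives in the Im2ColFunctor
// specialization and may differ, so this helper is illustrative only.
template <typename T>
inline void im2col_dispatch_sketch(const framework::Tensor& im,
                                   const std::vector<int>& dilation,
                                   const std::vector<int>& stride,
                                   const std::vector<int>& padding,
                                   framework::Tensor* col,
                                   const DataLayout data_layout) {
  bool unit = stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
              dilation[1] == 1;
  bool pad0 = padding[0] == 0 && padding[1] == 0 && padding[2] == 0 &&
              padding[3] == 0;
  bool pad1 = padding[0] == 1 && padding[1] == 1 && padding[2] == 1 &&
              padding[3] == 1;
  if (unit && pad0) {
    im2col_sh1sw1dh1dw1ph0pw0<T>(im, col, data_layout);
  } else if (unit && pad1) {
    im2col_sh1sw1dh1dw1ph1pw1<T>(im, col, data_layout);
  } else {
    im2col_common<T>(im, dilation, stride, padding, col, data_layout);
  }
}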
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "core/paddlefl_mpc/operators/math/math_function.h"
#include <vector>
#include "paddle/fluid/framework/data_type.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
auto out = framework::EigenMatrix<T>::From(*output);
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
}
}
};
template struct RowwiseAdd<platform::CPUDeviceContext, int64_t>;
using float16 = paddle::platform::float16;
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
struct RowwiseAdd {
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& vec, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
struct SetConstant {
void operator()(const DeviceContext& context, framework::Tensor* tensor,
T num);
};
template <typename DeviceContext, typename T, int Rank>
struct Transpose {
void operator()(const DeviceContext& context, const framework::Tensor& in,
framework::Tensor* out, const std::vector<int>& axis);
};
template <typename DeviceContext, typename T>
struct ColwiseSum {
void operator()(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* vec);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "core/paddlefl_mpc/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
framework::Tensor* tensor,
T num) {
auto t = framework::EigenVector<T>::Flatten(*tensor);
t.device(*context.eigen_device()) = t.constant(static_cast<T>(num));
}
template <typename DeviceContext, typename T, int Rank>
void Transpose<DeviceContext, T, Rank>::operator()(
const DeviceContext& context, const framework::Tensor& in,
framework::Tensor* out, const std::vector<int>& axis) {
Eigen::array<int, Rank> permute;
for (int i = 0; i < Rank; i++) {
permute[i] = axis[i];
}
auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
auto* dev = context.eigen_device();
eigen_out.device(*dev) = eigen_in.shuffle(permute);
}
template <typename DeviceContext, typename T>
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
}
// Specialize for CPU, since Eigen implements a general reduce. However,
// column-wise sum can be implemented directly, and the general reduce has a
// large overhead on CPU.
template <typename T>
class ColwiseSum<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
if (i == 0) {
out_buf[j] = in_buf[i * size + j];
} else {
out_buf[j] += in_buf[i * size + j];
}
}
}
}
};
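// A plain-buffer sketch of the same accumulation, kept free of any tensor
// plumbing so the reduction pattern is easy to verify in isolation; the
// helper is hypothetical and is not used by the operators.
template <typename T>
inline void colwise_sum_sketch(const T* in, int height, int width, T* out) {
  for (int j = 0; j < width; ++j) {
    out[j] = static_cast<T>(0);
  }
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      out[j] += in[i * width + j];  // accumulate row i into the column sums
    }
  }
}
// Example: in = {1, 2, 3, 4, 5, 6} viewed as a 2 x 3 matrix gives
// out = {5, 7, 9}.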
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "core/paddlefl_mpc/operators/math/sequence2batch.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& src,
framework::Vector<size_t> index_lod, framework::Tensor* dst,
bool is_src_index) {
size_t* index = index_lod.data();
auto src_dims = src.dims();
auto dst_dims = dst->dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
"The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
"The dst must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
"The width of src and dst must be same.");
auto height = dst_dims[0];
auto width = dst_dims[1];
auto* src_data = src.data<T>();
auto* dst_data = dst->data<T>();
const int sz = width * sizeof(T);
if (is_src_index) {
for (int i = 0; i < height; ++i) {
memcpy(dst_data + i * width, src_data + index[i] * width, sz);
}
} else {
for (int i = 0; i < height; ++i) {
memcpy(dst_data + index[i] * width, src_data + i * width, sz);
}
}
}
};
template class CopyMatrixRowsFunctor<platform::CPUDeviceContext, int64_t>;
template class LoDTensor2BatchFunctor<platform::CPUDeviceContext, int64_t>;
template class Batch2LoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class CopyMatrixRowsFunctor {
public:
// If is_src_index is true,
// copy the indexed rows of input src to the output dst.
// If is_src_index is false,
// copy the input src to the indexed rows of output dst.
// The indexed rows are based on the input index.
void operator()(const DeviceContext& context, const framework::Tensor& src,
framework::Vector<size_t> index_lod, framework::Tensor* dst,
bool is_src_index);
};
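// A plain-buffer sketch of the two copy directions described above; the
// helper is hypothetical and is not used by the functor. With
// is_src_index == true rows are gathered from src by index, otherwise rows
// are scattered into dst by index.
template <typename T>
inline void copy_matrix_rows_sketch(const T* src, const size_t* index,
                                    int height, int width, bool is_src_index,
                                    T* dst) {
  for (int i = 0; i < height; ++i) {
    const T* from = src + (is_src_index ? index[i] : i) * width;
    T* to = dst + (is_src_index ? i : index[i]) * width;
    for (int j = 0; j < width; ++j) {
      to[j] = from[j];
    }
  }
}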
template <typename DeviceContext, typename T>
class LoDTensor2BatchFunctor {
// Calculate the length of each sequence and
// sort sequence index by the length.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
//
struct SeqInfo {
SeqInfo(size_t start, size_t length, size_t seq_idx)
: start(start), length(length), seq_idx(seq_idx) {}
size_t start;
size_t length;
size_t seq_idx;
};
public:
void operator()(const DeviceContext& context,
const framework::LoDTensor& lod_tensor,
framework::LoDTensor* batch, bool is_cal_batch_lod,
bool is_reverse = false) const {
if (!is_cal_batch_lod) {
auto lods = batch->lod();
PADDLE_ENFORCE_GT(lods.size(), 2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]),
"The LoD information should be consistent with the dims.");
CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, lods[1], batch, true);
return;
}
auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
const auto& lod = lods[0];
std::vector<SeqInfo> seq_info;
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
size_t length = lod[seq_id + 1] - lod[seq_id];
seq_info.emplace_back(lod[seq_id], length, seq_id);
}
std::sort(seq_info.begin(), seq_info.end(),
[](SeqInfo a, SeqInfo b) {
return a.length > b.length;
});
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// max_seqlen = 5,
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
// batch_start_positions[0] = len(b0)
// batch_start_positions[1] = len(b0) + len(b1)
// batch_start_positions[2] = len(b0) + len(b1) + len(b2)
// ...
// seq2batch_idx[12] = {4, 0, 9,
// 5, 1, 10,
// 6, 2, 11,
// 7, 3,
// 8}
// seq_order = {1, 0, 2}, the sort order.
// where 1 is the second sequence,
// 0 is the first sequence,
// 2 is the third sequence.
// The max_seqlen represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence.
paddle::framework::LoD batch_lods;
batch_lods.emplace_back(std::vector<size_t> {0});
batch_lods.emplace_back(std::vector<size_t> {0});
batch_lods.emplace_back(std::vector<size_t> {0});
// batch_lods[0] is the start positions for batch LoDTensor
size_t max_seqlen = seq_info[0].length;
batch_lods[0].resize(max_seqlen + 1);
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor.
batch_lods[2].resize(seq_info.size());
size_t* batch_starts = batch_lods[0].data();
size_t* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0;
for (size_t n = 0; n < max_seqlen; n++) {
size_t batch_id = batch_starts[n];
for (size_t i = 0; i < seq_info.size(); ++i) {
size_t seq_len = seq_info[i].length;
size_t start = seq_info[i].start;
if (n < seq_len) {
seq2batch_idx[batch_id] =
is_reverse ? start + seq_len - 1 - n : start + n;
batch_id++;
} else {
break;
}
}
batch_starts[n + 1] = batch_id;
}
size_t* seq_order = batch_lods[2].data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx;
}
batch->set_lod(batch_lods);
CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, batch_lods[1], batch, true);
}
};
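// A standalone sketch of the index calculation above; the helper is
// hypothetical and is not used by the functor. Given sequence starts and
// lengths already sorted by decreasing length (as done after std::sort), it
// reproduces batch_starts and seq2batch_idx for the non-reversed case. For
// the example in the comment (starts {4, 0, 9}, lengths {5, 4, 3}) it yields
// batch_starts = {0, 3, 6, 9, 11, 12} and
// seq2batch_idx = {4, 0, 9, 5, 1, 10, 6, 2, 11, 7, 3, 8}.
inline void seq_to_batch_index_sketch(const std::vector<size_t>& starts,
                                      const std::vector<size_t>& lengths,
                                      std::vector<size_t>* batch_starts,
                                      std::vector<size_t>* seq2batch_idx) {
  size_t max_seqlen = lengths.empty() ? 0 : lengths[0];
  size_t total = 0;
  for (size_t len : lengths) {
    total += len;
  }
  batch_starts->assign(max_seqlen + 1, 0);
  seq2batch_idx->assign(total, 0);
  for (size_t n = 0; n < max_seqlen; ++n) {
    size_t batch_id = (*batch_starts)[n];
    for (size_t i = 0; i < lengths.size(); ++i) {
      if (n >= lengths[i]) {
        break;  // remaining sequences are shorter, stop this time step
      }
      (*seq2batch_idx)[batch_id++] = starts[i] + n;
    }
    (*batch_starts)[n + 1] = batch_id;
  }
}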
template <typename DeviceContext, typename T>
class Batch2LoDTensorFunctor {
public:
void operator()(const DeviceContext& context,
const framework::LoDTensor& batch,
framework::LoDTensor* lod_tensor) const {
auto in_lod = batch.lod();
PADDLE_ENFORCE_GT(in_lod.size(), 2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]),
"The LoD information should be consistent with the dims.");
CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
to_seq(context, batch, in_lod[1], lod_tensor, false);
}
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "./vol2col.h"
#include <vector>
namespace paddle {
namespace operators {
namespace math {
/*
* vol = [input_channels, input_depth, input_height, input_width]
* col =
* [input_channels, filter_depth, filter_height, filter_width,
* output_depth, output_height, output_width]
*/
template <class T>
class Vol2ColFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& vol,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
"The dimension of col should be 7.");
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]);
int filter_depth = col->dims()[1];
int filter_height = col->dims()[2];
int filter_width = col->dims()[3];
int output_depth = col->dims()[4];
int output_height = col->dims()[5];
int output_width = col->dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
    // paddings may contain either 6 values [front, back, up, down, left,
    // right] or 3 values [depth, height, width].
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
const T* vol_data = vol.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int c_in = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
int col_idx =
((c * output_depth + d) * output_height + h) * output_width + w;
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
c_in;
}
col_data[col_idx] =
(h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
? static_cast<T>(0)
: vol_data[vol_idx];
}
}
}
}
}
};
/*
* vol = [input_channels,input_depth, input_height, input_width]
* col =
* [input_channels, filter_depth, filter_height, filter_width,
* output_depth, output_height, output_width]
*/
template <class T>
class Col2VolFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& col,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
"The dimension of col should be 7.");
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]);
int filter_depth = col.dims()[1];
int filter_height = col.dims()[2];
int filter_width = col.dims()[3];
int output_depth = col.dims()[4];
int output_height = col.dims()[5];
int output_width = col.dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int cIm = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx =
((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
cIm;
}
int col_idx =
((c * output_depth + d) * output_height + h) * output_width +
w;
vol_data[vol_idx] += col_data[col_idx];
}
}
}
}
}
}
};
template class Vol2ColFunctor<platform::CPUDeviceContext, int64_t>;
template class Col2VolFunctor<platform::CPUDeviceContext, int64_t>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/*
 * \brief Converts four-dimensional feature data (CDHW) into seven-dimensional
 * colData in the Vol2ColFunctor calculation;
 * the Col2VolFunctor calculation reverses the transformation.
*
* \param volData Vol data.
* \param volShape The shape of volData,
* [input_channels, input_depth, input_height, input_width].
* \param colData Column data.
* \param colShape The shape of colData.
*
* \param dilations dilation data.
* \param 3-dimension [dilation_depth, dilation_height, dilation_width].
*
* \param strides stride data.
* \param 3-dimension [stride_depth, stride_height, stride_width].
*
* \param paddings padding data.
* \param 3-dimension [d_pad, h_pad, w_pad].
*
* The shape of colData is:
* [input_channels, filter_depth, filter_height, filter_width, output_depth,
* output_height, output_width]
* So, it is easy to reshape into a convolution matrix for convolution
* calculation based on matrix multiplication.
* The shape of convolution matrix is [height, width], where the height is equal
* input_channels * filter_depth * filter_height * filter_width, and the width
* is equal output_depth * output_height * output_width.
*
* Reshape:
* shape of colData shape of convolution matrix
* [input_channels,
* filter_depth,
* filter_height,
* filter_width, ======> [height, width]
* output_depth,
* output_height,
* output_width]
*
* \note The caller needs to ensure that volShape.inputChannels is equal to
* colShape.inputChannels.
*/
template <typename DeviceContext, typename T>
class Vol2ColFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& vol,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout = DataLayout::kNCHW) const;
};
template <typename DeviceContext, typename T>
class Col2VolFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& col,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout = DataLayout::kNCHW) const;
};
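// A minimal usage sketch, assuming a CDHW volume tensor and a pre-allocated
// col tensor in the seven-dimensional layout described above; the function
// name and concrete shapes are illustrative only. The int64_t instantiation
// is the one provided by vol2col.cc.
inline void vol2col_usage_sketch(const platform::CPUDeviceContext& ctx,
                                 const framework::Tensor& vol,
                                 framework::Tensor* col) {
  // Unit stride and dilation, no padding: [d_pad, h_pad, w_pad].
  std::vector<int> dilations{1, 1, 1};
  std::vector<int> strides{1, 1, 1};
  std::vector<int> paddings{0, 0, 0};
  Vol2ColFunctor<platform::CPUDeviceContext, int64_t> vol2col;
  vol2col(ctx, vol, dilations, strides, paddings, col, DataLayout::kNCHW);
  // Col2VolFunctor reverses the transform and accumulates overlapping
  // windows back into the volume.
}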
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mpc_adam_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include <string>
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class MpcAdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override;
};
void MpcAdamOp::InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE_EQ(
ctx->HasInput("Param"), true,
platform::errors::NotFound("Input(Param) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Grad"), true,
platform::errors::NotFound("Input(Grad) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true,
platform::errors::NotFound(
"Input(Moment1) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true,
platform::errors::NotFound(
"Input(Moment2) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true,
platform::errors::NotFound(
"Input(Beta1Pow) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true,
platform::errors::NotFound(
"Input(Beta2Pow) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true,
platform::errors::NotFound(
"Output(Moment1Out) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true,
platform::errors::NotFound(
"Output(Moment2Out) of AdamOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(
framework::product(lr_dims), 0,
platform::errors::InvalidArgument(
"The number of LearningRate shall not be 0, but received %d. Maybe "
"the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.",
framework::product(lr_dims)));
PADDLE_ENFORCE_EQ(
framework::product(lr_dims), 1,
platform::errors::InvalidArgument(
"Learning rate should have 1 dimension, but received %d",
framework::product(lr_dims)));
auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]";
PADDLE_ENFORCE_GE(framework::product(beta1_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta1 power accumulator should be greater "
"than 0, but received %d.",
framework::product(beta1_pow_dims)));
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]";
PADDLE_ENFORCE_GE(framework::product(beta2_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta2 power accumulator should be greater "
"than 0, but received %d.",
framework::product(beta2_pow_dims)));
auto param_dims = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
platform::errors::InvalidArgument(
"Param and Grad input of AdamOp should have same dimension. But "
"received Param dims: [%s], Grad dims: [%s].",
param_dims, ctx->GetInputDim("Grad")));
}
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
platform::errors::InvalidArgument(
"Param and Moment1 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment1 dims: [%s].",
param_dims, ctx->GetInputDim("Moment1")));
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment2"),
platform::errors::InvalidArgument(
"Param and Moment2 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment2 dims: [%s].",
param_dims, ctx->GetInputDim("Moment2")));
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
ctx->SetOutputDim("Moment2Out", param_dims);
ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims);
}
framework::OpKernelType MpcAdamOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
framework::OpKernelType MpcAdamOp::GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const {
if (var_name == "Beta1Pow" || var_name == "Beta2Pow") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
class MpcAdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("LearningRate", "(Tensor) Learning rate");
AddInput("Moment1", "(Tensor) Input first moment");
AddInput("Moment2", "(Tensor) Input second moment");
AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
AddInput("Beta1Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta1, this has a higher priority than attr(beta1), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("Beta2Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta2, this has a higher priority than attr(beta2), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("Moment1Out", "(Tensor) Output first moment");
AddOutput("Moment2Out", "(Tensor) Output second moment");
AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
AddAttr<float>("beta1",
"(float, default 0.9) "
"Exponential decay rate for the "
"first moment estimates.")
.SetDefault(0.9f);
AddAttr<float>("beta2",
"(float, default 0.999) "
"exponential decay rate for the "
"second moment estimates.")
.SetDefault(0.999f);
AddAttr<float>("epsilon",
"(float, default 1.0e-4) "
"Constant for numerical stability")
.SetDefault(1.0e-4f);
AddComment(R"DOC(
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
mpc_adam, ops::MpcAdamOp, ops::MpcAdamOpMaker);
REGISTER_OP_CPU_KERNEL(
mpc_adam,
ops::MpcAdamOpKernel<paddle::platform::CPUDeviceContext, int64_t, float>);
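// A plain floating-point reference of one Adam step, following the update
// equations in the op comment above; it is illustrative only and is not used
// by the operator. The MPC kernel in mpc_adam_op.h performs the same update
// on secret shares and folds epsilon into the inverse square root instead of
// adding it afterwards.
inline void adam_step_reference_sketch(float grad, float lr, float beta1,
                                       float beta2, float epsilon,
                                       float beta1_pow, float beta2_pow,
                                       float* param, float* moment1,
                                       float* moment2) {
  *moment1 = beta1 * (*moment1) + (1.0f - beta1) * grad;
  *moment2 = beta2 * (*moment2) + (1.0f - beta2) * grad * grad;
  // Bias-corrected learning rate, as in the op comment.
  float lr_t = lr * sqrt(1.0f - beta2_pow) / (1.0f - beta1_pow);
  *param = *param - lr_t * (*moment1) / (sqrt(*moment2) + epsilon);
}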
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "mpc_op.h"
#include <math.h>
#include "./math/math_function.h"
#include "core/paddlefl_mpc/mpc_protocol/aby3_operators.h"
namespace paddle {
namespace operators {
static inline float GetAttrFromTensor(const framework::Tensor* tensor) {
  // The tensor is expected to reside on CPU, so its first element can be read
  // directly.
  const float* tensor_data = tensor->data<float>();
  return tensor_data[0];
}
template <typename DeviceContext, typename T, typename T1>
class MpcAdamOpKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &ctx) const override{
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type()));
using paddle::framework::LoDTensor;
T1 epsilon = static_cast<T1>(ctx.Attr<float>("epsilon"));
auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
auto* mom1 = ctx.Input<LoDTensor>("Moment1");
auto* mom2 = ctx.Input<LoDTensor>("Moment2");
auto* lr = ctx.Input<LoDTensor>("LearningRate");
auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");
auto* mom2_out = ctx.Output<LoDTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
T1 beta1 = static_cast<T1>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(Beta1Tensor) size must be 1, but get %d",
beta1_tensor->numel()));
beta1 = static_cast<T1>(GetAttrFromTensor(beta1_tensor));
}
T1 beta2 = static_cast<T1>(ctx.Attr<float>("beta2"));
if (ctx.HasInput("Beta2Tensor")) {
auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(Beta2Tensor) size must be 1, but get %d",
beta2_tensor->numel()));
beta2 = static_cast<T1>(GetAttrFromTensor(beta2_tensor));
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
<< "beta2_pow.numel() : " << beta2_pow->numel();
VLOG(3) << "param.numel(): " << param->numel();
PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
if (grad_var->IsType<framework::LoDTensor>()) {
auto* grad = ctx.Input<LoDTensor>("Grad");
// AdamFunctor<T, CPUAdam> functor(
// beta1, beta2, epsilon, beta1_pow->data<T>(), beta2_pow->data<T>(),
// mom1->data<T>(), mom1_out->mutable_data<T>(ctx.GetPlace()),
// mom2->data<T>(), mom2_out->mutable_data<T>(ctx.GetPlace()),
// lr->data<T>(), grad->data<T>(), param->data<T>(),
// param_out->mutable_data<T>(ctx.GetPlace()));
// functor(param->numel());
T1 lr_value = *lr->template data<T1>();
T1 beta1_pow_ = *beta1_pow->template data<T1>();
T1 beta2_pow_ = *beta2_pow->template data<T1>();
double lr_ = lr_value * sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
framework::Tensor temp;
temp.mutable_data<T>(param->dims(), ctx.GetPlace());
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(grad, (1 - beta1), &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(mom1, beta1, mom1_out);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->add(mom1_out, &temp, mom1_out);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(grad, (1 - beta2), &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->mul(grad, &temp, &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(mom2, beta2, mom2_out);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->add(mom2_out, &temp, mom2_out);
// mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(grad, lr[0], &temp);
// mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->sub(param, &temp, param_out);
math::SetConstant<DeviceContext, T> set_const;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
set_const(
dev_ctx,
&temp,
T(epsilon * pow(2, mpc::ABY3_SCALING_FACTOR) / 3));
// temp = epsilon + mom2_out
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->add(mom2_out, &temp, &temp);
// temp = 1 / sqrt(epsilon + mom2_out)
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->inverse_square_root(&temp, &temp);
// temp = mom1_out / sqrt(epsilon + mom2_out)
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->mul(mom1_out, &temp, &temp);
// temp = lr * mom1_out / sqrt(epsilon + mom2_out)
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(&temp, lr_, &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->sub(param, &temp, param_out);
beta1_pow_out->mutable_data<T1>(ctx.GetPlace())[0] =
beta1 * beta1_pow->template data<T1>()[0];
beta2_pow_out->mutable_data<T1>(ctx.GetPlace())[0] =
beta2 * beta2_pow->template data<T1>()[0];
} else {
PADDLE_THROW("Variable type not supported by adam_op");
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/data_layout.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "mpc_batch_norm_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
class MpcBatchNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override{
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNorm");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNorm");
OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "BatchNorm");
OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "BatchNorm");
OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "BatchNorm");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "BatchNorm");
bool is_test = ctx->Attrs().Get<bool>("is_test");
bool trainable_stats = ctx->Attrs().Get<bool>("trainable_statistics");
bool test_mode = is_test && (!trainable_stats);
if (!test_mode) {
OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut", "BatchNorm");
OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut",
"BatchNorm");
OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean",
"BatchNorm");
OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance",
"BatchNorm");
}
// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
platform::errors::InvalidArgument(
"Mean and MeanOut should share the same memory"));
PADDLE_ENFORCE_EQ(
ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0],
platform::errors::InvalidArgument(
"Variance and VarianceOut should share the same memory"));
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
if (ctx->IsRuntime() && ctx->HasInput("MomentumTensor")) {
auto mom = ctx->Inputs("MomentumTensor");
PADDLE_ENFORCE_EQ(mom.size(), 1,
platform::errors::InvalidArgument(
"The input tensor MomentumTensor's size must be 1"
"But received: MomentumTensor's size is [%d]",
mom.size()));
}
PADDLE_ENFORCE_GE(
x_dims.size(), 3,
platform::errors::InvalidArgument(
"ShapeError: the dimension of input "
"X must greater than or equal to 3. But received: the shape of input "
"X = [%s], the dimension of input X =[%d]",
x_dims, x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(), 6,
platform::errors::InvalidArgument(
"ShapeError: the dimension of input X "
"must smaller than or equal to 6. But received: the shape of input X "
"= [%s], the dimension of input X = [%d]",
x_dims, x_dims.size()));
const int64_t C =
((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW)
? x_dims[2]
: x_dims[x_dims.size() - 1]);
auto scale_dim = ctx->GetInputDim("Scale");
auto bias_dim = ctx->GetInputDim("Bias");
VLOG(3) << "*** scale_dims: " << scale_dim;
VLOG(3) << "*** bias_dims: " << bias_dim;
VLOG(3) << "*** mean_dims: " << ctx->GetInputDim("Mean");
VLOG(3) << "*** variance_dims: " << ctx->GetInputDim("Variance");
//VLOG(3) << "*** Y_dims: " << ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(
scale_dim.size(), 2UL,
platform::errors::InvalidArgument(
"ShapeError: the dimension of scale must equal to 2."
"But received: the shape of scale is [%s], the dimension "
"of scale is [%d]",
scale_dim, scale_dim.size()));
PADDLE_ENFORCE_EQ(bias_dim.size(), 2UL,
platform::errors::InvalidArgument(
"ShapeError: the dimension of bias must equal to 2."
"But received: the shape of bias is [%s],the dimension "
"of bias is [%d]",
bias_dim, bias_dim.size()));
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
framework::product(bias_dim) <= 0)) {
check = false;
}
if (check) {
PADDLE_ENFORCE_EQ(scale_dim[1], C,
platform::errors::InvalidArgument(
"ShapeError: the shape of scale must equal to [%d]"
"But received: the shape of scale is [%d]",
C, scale_dim[1]));
PADDLE_ENFORCE_EQ(bias_dim[1], C,
platform::errors::InvalidArgument(
"ShapeError: the shape of bias must equal to [%d]"
"But received: the shape of bias is [%d]",
C, bias_dim[1]));
}
ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("MeanOut", {2, C}); // 2: share_num
ctx->SetOutputDim("VarianceOut", {2, C});
ctx->SetOutputDim("SavedMean", {2, C});
ctx->SetOutputDim("SavedVariance", {2, C});
ctx->ShareLoD("X", "Y");
}
protected:
framework::OpKernelType GetExpectedKernelType(const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
layout_, library_);
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};
class MpcBatchNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override{
// check input
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
framework::GradVarName("Y"), "BatchNormGrad");
OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
"BatchNormGrad");
OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
"BatchNormGrad");
// check output
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
framework::GradVarName("X"), "BatchNormGrad");
const bool has_scale_grad = ctx->HasOutput(framework::GradVarName("Scale"));
const bool has_bias_grad = ctx->HasOutput(framework::GradVarName("Bias"));
PADDLE_ENFORCE_EQ((has_scale_grad == has_bias_grad), true,
platform::errors::NotFound(
"Output(Scale@GRAD) and Output(Bias@GRAD) must be null "
"or not be null at same time. But now, "
"has Scale@Grad=[%d], has Bias@GRAD=[%d]",
has_scale_grad, has_bias_grad));
const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
if (use_global_stats) {
PADDLE_ENFORCE_EQ(
!ctx->Attrs().Get<bool>("use_mkldnn"), true,
platform::errors::InvalidArgument(
"Using global stats during training is not supported "
"in gradient op kernel of batch_norm_mkldnn_op now."));
}
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNormGrad");
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
const int C =
((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW)
? x_dims[2]
: x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
// has_scale_grad == has_bias_grad, judge has_scale_grad is enough
if (has_scale_grad) {
ctx->SetOutputDim(framework::GradVarName("Scale"), {2, C}); // 2: share_num
ctx->SetOutputDim(framework::GradVarName("Bias"), {2, C});
}
}
protected:
framework::OpKernelType GetExpectedKernelType(const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_);
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};
class MpcBatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE_GE(
epsilon, 0.0f,
platform::errors::InvalidArgument(
"'epsilon' should be greater or equal than 0.0."));
PADDLE_ENFORCE_LE(epsilon, 0.001f,
platform::errors::InvalidArgument(
"'epsilon' should be less or equal than 0.001."));
});
AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
AddInput("X", "The input tensor");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("Mean",
"The global mean (for training) or "
"estimated mean (for testing)");
AddInput("Variance",
"The global variance (for training) "
"or estimated Variance (for testing)");
AddInput("MomentumTensor",
"(Tensor<float32>, optional) If provided, batch_norm will "
"use this as momentum, this has a higher priority than "
"attr(momentum), the shape of this tensor MUST BE [1].")
.AsDispensable();
AddOutput("Y", "result after normalization");
AddOutput("MeanOut",
"Share memory with Mean. "
"Store the global mean when training");
AddOutput("VarianceOut",
"Share memory with Variance. "
"Store the global Variance when training");
AddOutput("SavedMean",
"Mean of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("SavedVariance",
"Variance of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("ReserveSpace",
"Reserve GPU space for triggering the new semi-persistent "
"NHWC kernel")
.AsDispensable();
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_with_relu",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("use_global_stats",
"(bool, default false) Whether to use global mean and "
"variance. In inference or test mode, set use_global_stats "
"to true or is_test true. the behavior is equivalent. "
"In train mode, when setting use_global_stats True, the "
"global mean and variance are also used during train time, "
"the BN acts as scaling and shiffting.")
.SetDefault(false);
AddAttr<bool>("trainable_statistics",
"(bool, default false) Whether to calculate mean and variance "
"in test mode. If setting true in test mode, mean and variace "
"will be calculated by current batch statistics.")
.SetDefault(false);
AddComment(R"DOC(
Batch Normalization.
Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}
};
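The DOC block above describes batch normalization only informally; the plaintext sketch below (our own illustration, not part of this operator, and with the leading share dimension of 2 dropped) spells out the per-channel transform for NCHW data.

// Plaintext batch-norm forward that the MPC kernel evaluates on shares.
#include <cmath>
#include <vector>

// x holds an [N, C, H*W] tensor flattened row-major; scale/bias/mean/var have size C.
void batch_norm_nchw(std::vector<float>& x, int N, int C, int HxW,
                     const std::vector<float>& scale, const std::vector<float>& bias,
                     const std::vector<float>& mean, const std::vector<float>& var,
                     float epsilon) {
    for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
            const float inv_std = 1.0f / std::sqrt(var[c] + epsilon);
            for (int i = 0; i < HxW; ++i) {
                float& v = x[(n * C + c) * HxW + i];
                v = scale[c] * (v - mean[c]) * inv_std + bias[c];  // y = scale * x_hat + bias
            }
        }
    }
}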
template <typename T>
class MpcBatchNormGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const {
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("X", this->Input("X"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetInput("Scale", this->Input("Scale"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput("SavedMean", this->Output("SavedMean"));
op->SetInput("SavedVariance", this->Output("SavedVariance"));
if (this->HasOutput("ReserveSpace")) {
op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
}
// used when setting use_global_stats True during training
if (boost::get<bool>(this->GetAttr("use_global_stats"))) {
op->SetInput("Mean", this->Output("MeanOut"));
op->SetInput("Variance", this->Output("VarianceOut"));
}
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
}
};
class MpcBatchNormOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string>& GetInputOutputWithSameType() const override {
static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Y"}};
return m;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
mpc_batch_norm, ops::MpcBatchNormOp, ops::MpcBatchNormOpMaker,
ops::MpcBatchNormOpInferVarType,
ops::MpcBatchNormGradOpMaker<paddle::framework::OpDesc>,
ops::MpcBatchNormGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mpc_batch_norm_grad, ops::MpcBatchNormGradOp);
REGISTER_OP_CPU_KERNEL(
mpc_batch_norm, ops::MpcBatchNormKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
mpc_batch_norm_grad, ops::MpcBatchNormGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -69,6 +69,119 @@ private:
    int64_t n_;
};
template <typename T, typename DeviceContext>
class MidWiseTransformIterator;
template <typename T>
class MidWiseTransformIterator<T, platform::CPUDeviceContext>
: public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
T *, T &> {
public:
MidWiseTransformIterator(const T *ptr, int n, int post)
: ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
MidWiseTransformIterator<T, platform::CPUDeviceContext> &operator++() {
++j_;
if (UNLIKELY(j_ == post_)) {
++i_;
j_ = 0;
if (UNLIKELY(i_ == n_)) {
i_ = 0;
}
}
return *this;
}
MidWiseTransformIterator<T, platform::CPUDeviceContext> &operator+(int n) {
while (n-- > 0) {
++j_;
if (UNLIKELY(j_ == post_)) {
++i_;
j_ = 0;
if (UNLIKELY(i_ == n_)) {
i_ = 0;
}
}
}
return *this;
}
bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext>
&rhs) const {
return (ptr_ + i_) == &(*rhs);
}
bool operator!=(const MidWiseTransformIterator<T, platform::CPUDeviceContext>
&rhs) const {
return (ptr_ + i_) != &(*rhs);
}
const T &operator*() { return ptr_[i_]; }
private:
const T *ptr_;
int64_t i_;
int64_t j_;
int64_t n_;
int64_t post_;
};
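A quick way to read MidWiseTransformIterator: dereferencing yields each of the n broadcast values post times in a row, then wraps around. The standalone snippet below (ours, for illustration only) replays the operator++ logic and prints the expected visiting order.

// Expected visiting order for y = {10, 20}, n = 2, post = 3:
// 10 10 10 20 20 20 10 10 10
#include <iostream>

int main() {
    const int y[] = {10, 20};
    const int n = 2, post = 3;
    int i = 0, j = 0;            // same state as i_ and j_ in the iterator
    for (int step = 0; step < 9; ++step) {
        std::cout << y[i] << ' ';
        ++j;                     // advance exactly like operator++ above
        if (j == post) {
            j = 0;
            if (++i == n) i = 0;
        }
    }
    std::cout << '\n';
    return 0;
}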
template <typename Functor, typename T, typename DeviceContext,
typename OutType = T>
class TransformFunctor {
public:
TransformFunctor(const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z, const DeviceContext &ctx, Functor func,
const bool is_xsize_larger = true)
: x_(x->data<T>()),
y_(y->data<T>()),
z_(z->mutable_data<OutType>(ctx.GetPlace())),
nx_(x->numel()),
ctx_(ctx),
func_(func),
is_xsize_larger_(is_xsize_larger) {
if (is_xsize_larger_ == false) {
nx_ = y->numel();
}
}
inline void Run() const {
platform::Transform<DeviceContext> trans;
trans(ctx_, x_, x_ + nx_, y_, z_, func_);
}
inline void RunRowWise(int n, int pre) const {
platform::Transform<DeviceContext> trans;
if (is_xsize_larger_) {
trans(ctx_, x_, x_ + nx_,
RowwiseTransformIterator<T, DeviceContext>(y_, n), z_, func_);
} else {
trans(ctx_, y_, y_ + nx_,
RowwiseTransformIterator<T, DeviceContext>(x_, n), z_, func_);
}
}
inline void RunMidWise(int n, int pre, int post) const {
platform::Transform<DeviceContext> trans;
if (is_xsize_larger_) {
trans(ctx_, x_, x_ + nx_,
MidWiseTransformIterator<T, DeviceContext>(y_, n, post), z_, func_);
} else {
trans(ctx_, y_, y_ + nx_,
MidWiseTransformIterator<T, DeviceContext>(x_, n, post), z_, func_);
}
}
private:
const T *x_;
const T *y_;
OutType *z_;
int64_t nx_;
const DeviceContext &ctx_;
Functor func_;
bool is_xsize_larger_;
};
template <typename T>
struct AddFunctor {
    inline HOSTDEVICE T operator()(T x, T y) { return x + y; }

@@ -114,38 +227,45 @@ public:
        if (in_x_t->dims() == in_y_t->dims()) {
            mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->add(in_x_t, in_y_t, out_t);
        } else {
            Tensor in_x_t_slice;
            Tensor in_y_t_slice;
            Tensor out_t_slice;

            for (size_t i = 0; i < SHARE_NUM; ++i) {
                in_x_t_slice = in_x_t->Slice(i, i + 1);
                in_y_t_slice = in_y_t->Slice(i, i + 1);
                out_t_slice = out_t->Slice(i, i + 1);

                auto x_dims = in_x_t_slice.dims();
                auto y_dims = in_y_t_slice.dims();

                axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);

                PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                               "Axis should be in range [0, x_dims)");

                int pre, n, post;
                GetMidDims get_mid_dims;
                get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
-               PADDLE_ENFORCE_EQ(post, 1,
-                                 "post should be equal 1, but received post is [%s]", post);

                auto x_ = in_x_t_slice.data<T>();
                auto y_ = in_y_t_slice.data<T>();
                auto out_ = out_t_slice.data<T>();
                auto nx_ = in_x_t_slice.numel();

                paddle::platform::Transform<DeviceContext> trans;
-               trans(ctx.template device_context<DeviceContext>(), x_, x_ + nx_,
-                     RowwiseTransformIterator<T, DeviceContext>(y_, n),
-                     out_, AddFunctor<T>());
+               if (post == 1) {
+                   trans(ctx.template device_context<DeviceContext>(), x_, x_ + nx_,
+                         RowwiseTransformIterator<T, DeviceContext>(y_, n),
+                         out_, AddFunctor<T>());
+               } else {
+                   trans(ctx.template device_context<DeviceContext>(), x_, x_ + nx_,
+                         MidWiseTransformIterator<T, DeviceContext>(y_, n, post),
+                         out_, AddFunctor<T>());
+               }
            }
        }
    }
};
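The pre/n/post convention used above comes from Paddle's GetMidDims helper. The self-contained sketch below is a simplified re-implementation for illustration only (not the library routine); it shows the split for two concrete shapes and why post == 1 selects the row-wise path while post > 1 now takes the new mid-wise path.

// Simplified pre/n/post split for broadcasting y into x at a given axis.
#include <cassert>
#include <vector>

void get_mid_dims_demo(const std::vector<int>& x, const std::vector<int>& y, int axis,
                       int* pre, int* n, int* post) {
    *pre = 1; *n = 1; *post = 1;
    for (int i = 0; i < axis; ++i) *pre *= x[i];
    for (size_t i = 0; i < y.size(); ++i) *n *= y[i];
    for (size_t i = axis + y.size(); i < x.size(); ++i) *post *= x[i];
}

int main() {
    int pre, n, post;
    get_mid_dims_demo({3, 4, 5}, {4}, /*axis=*/1, &pre, &n, &post);
    assert(pre == 3 && n == 4 && post == 5);  // mid-wise case: y broadcast over a middle dim
    get_mid_dims_demo({3, 4}, {4}, /*axis=*/1, &pre, &n, &post);
    assert(pre == 3 && n == 4 && post == 1);  // row-wise case handled as before
    return 0;
}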
@@ -185,17 +305,15 @@ public:
                int pre, n, post;
                GetMidDims get_mid_dims;
                get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
-               PADDLE_ENFORCE_EQ(post, 1,
-                                 "post should be equal 1, but received post is [%s]", post);
+               std::fill(dy_data, dy_data + dy->numel(), static_cast<T>(0));

                for (size_t i = 0; i < SHARE_NUM; ++i) {
                    int y_offset = i * n;
                    for (size_t j = 0; j < pre; ++j) {
                        for (size_t k = 0; k < n; ++k) {
-                           int out_offset = i * pre * n + j * n + k;
-                           if (0 == j) {
-                               dy_data[k + y_offset] = dout_data[out_offset];
-                           } else {
-                               dy_data[k + y_offset] += dout_data[out_offset];
-                           }
+                           for (size_t m = 0; m < post; ++m) {
+                               int out_offset = i * pre * n * post + j * n * post + k * post + m;
+                               dy_data[k + y_offset] += dout_data[out_offset];
+                           }
                        }
                    }
...
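The rewritten out_offset is just the row-major linearization of (share, j, k, m) over a [SHARE_NUM, pre, n, post] layout, so dy[k] now accumulates dout over both the pre and post dimensions instead of requiring post == 1. A tiny self-check (sizes chosen arbitrarily for illustration):

// Check the flattened offset against an explicit row-major [i][j][k][m] index.
#include <cassert>

int main() {
    const int share_num = 2, pre = 3, n = 4, post = 5;
    for (int i = 0; i < share_num; ++i)
        for (int j = 0; j < pre; ++j)
            for (int k = 0; k < n; ++k)
                for (int m = 0; m < post; ++m) {
                    int out_offset = i * pre * n * post + j * n * post + k * post + m;
                    int expected = ((i * pre + j) * n + k) * post + m;
                    assert(out_offset == expected);
                }
    return 0;
}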
@@ -150,6 +150,7 @@ public:
        if (dx) {
            dx->mutable_data<T>(ctx.GetPlace());
+           auto dx_dim = dx->dims();
            if (dx->dims().size() > 3) {
                dx->Resize({2, x_mat_width, x_mat_height});
            }
@@ -160,7 +161,6 @@ public:
            // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
            mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->matmul(
                &dout_matrix, &y_matrix_trans, dx);
-           auto dx_dim = dx->dims();
            if (dx_dim.size() > 3) {
                dx->Resize(dx_dim);
            }
@@ -168,6 +168,7 @@ public:
        if (dy) {
            dy->mutable_data<T>(ctx.GetPlace());
+           auto dy_dim = dy->dims();
            if (dy->dims().size() > 3) {
                dy->Resize({2, y_mat_width, y_mat_height});
            }
@@ -179,7 +180,6 @@ public:
            // dy = x' * dout. dy K x N, dout : M x N, x : M x K
            mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->matmul(
                &x_matrix_trans, &dout_matrix, dy);
-           auto dy_dim = dy->dims();
            if (dy_dim.size() > 3) {
                dy->Resize(dy_dim);
            }
...
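The moved dx_dim / dy_dim lines fix an ordering bug: dims() read after the temporary 3-D Resize can no longer tell whether the gradient originally had more than three dimensions, so the original shape was never restored. The toy snippet below (a made-up stand-in type, not Paddle's Tensor) shows the save, reshape, restore pattern.

// Save the original dims before the temporary reshape, restore them afterwards.
#include <cassert>
#include <vector>

struct FakeTensor {
    std::vector<int> dims;
    void Resize(std::vector<int> d) { dims = std::move(d); }
};

int main() {
    FakeTensor dx;
    dx.dims = {2, 8, 3, 4};                     // more than 3 dims, as in the guarded branch
    auto dx_dim = dx.dims;                      // saved BEFORE the temporary reshape (the fix)
    if (dx.dims.size() > 3) dx.Resize({2, 8, 12});
    // ... matmul writes into the flattened view here ...
    if (dx_dim.size() > 3) dx.Resize(dx_dim);   // restore the original shape
    assert(dx.dims == (std::vector<int>{2, 8, 3, 4}));
    return 0;
}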