"...operators/distributed/grpc/grpc_bytebuffer_stream.cc" does not exist at "45af8c1e99333d807c052277220b0fd01b2bd18a"
Commit b8d8ee2b authored by: Y yangqingyou

Merge branch 'master' of https://github.com/PaddlePaddle/PaddleFL into refactor_context

Conflicts:
	core/paddlefl_mpc/mpc_protocol/abstract_context.h
	core/paddlefl_mpc/mpc_protocol/aby3_operators.h
	core/privc3/boolean_tensor.h
	core/privc3/boolean_tensor_impl.h
	core/privc3/fixedpoint_tensor.h
......@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.15)
project(PaddleEncrypted)
add_compile_options(-msse4.2 -maes -fPIC -DPADDLE_WITH_MKLDNN)
add_compile_options(-msse4.2 -fPIC -DPADDLE_WITH_MKLDNN -O2)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(CMAKE_CXX_STANDARD 11)
......@@ -34,8 +34,8 @@ execute_process(COMMAND ${PYTHON} -c "import paddle;print(paddle.version.full_ve
RESULT_VARIABLE ret OUTPUT_VARIABLE paddle_version OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ret)
if (NOT ${paddle_version} STREQUAL "1.8.0")
message(FATAL_ERROR "Paddle installation of 1.8.0 is required but ${paddle_version} is found")
if (NOT ${paddle_version} STRGREATER_EQUAL "1.8.0")
message(FATAL_ERROR "Paddle installation of >= 1.8.0 is required but ${paddle_version} is found")
endif()
else()
message(FATAL_ERROR "Could not get paddle version.")
......@@ -57,6 +57,10 @@ option(WITH_TESTING "Compile with unit testing" ON)
option(WITH_PSI "Compile with psi lib" ON)
option(USE_AES_NI "Compile with AES NI" ON)
option(USE_OPENMP "Compile with OpenMP" ON)
########################### the project build part ###############################
message(STATUS "Using paddlepaddle installation of ${paddle_version}")
message(STATUS "paddlepaddle include directory: ${PADDLE_INCLUDE}")
......@@ -70,6 +74,15 @@ include_directories(.)
include_directories(${PADDLE_INCLUDE})
include_directories(${PADDLE_INCLUDE}/third_party)
if (USE_AES_NI)
add_compile_definitions(USE_AES_NI)
add_compile_options(-maes)
endif (USE_AES_NI)
if (USE_OPENMP)
add_compile_options(-fopenmp)
find_package(OpenMP REQUIRED)
endif(USE_OPENMP)
add_subdirectory(core/privc3)
add_subdirectory(core/paddlefl_mpc/mpc_protocol)
......
<img src='https://github.com/PaddlePaddle/PaddleFL/blob/master/docs/source/_static/FL-logo.png' width = "400" height = "160">
[DOC](https://paddlefl.readthedocs.io/en/latest/) | [Quick Start](https://paddlefl.readthedocs.io/en/latest/instruction.html) | [中文](./README_cn.md)
[DOC](https://paddlefl.readthedocs.io/en/latest/) | [Quick Start](https://paddlefl.readthedocs.io/en/latest/compile_and_intall.html) | [中文](./README_cn.md)
PaddleFL is an open source federated learning framework based on PaddlePaddle. Researchers can easily replicate and compare different federated learning algorithms with PaddleFL, and developers can also benefit from PaddleFL because it is easy to deploy a federated learning system in large-scale distributed clusters. In PaddleFL, several federated learning strategies will be provided, with applications in computer vision, natural language processing, recommendation, and so on. Applications of traditional machine learning training strategies, such as multi-task learning and transfer learning in federated learning settings, will also be provided. Based on PaddlePaddle's large-scale distributed training and elastic scheduling of training jobs on Kubernetes, PaddleFL can be easily deployed on full-stack open-source software.
......@@ -42,7 +42,7 @@ We **highly recommend** to run PaddleFL in Docker
```sh
#Pull and run the docker
docker pull hub.baidubce.com/paddlefl/paddle_fl:latest
docker run --name <docker_name> --net=host -it -v $PWD:/root <image id> /bin/bash
docker run --name <docker_name> --net=host -it -v $PWD:/paddle <image id> /bin/bash
#Install paddle_fl
pip install paddle_fl
......
......@@ -39,7 +39,7 @@ PaddleFL mainly provides two kinds of solutions: **Data Parallel** and **Federate
```sh
#Pull and run the docker
docker pull hub.baidubce.com/paddlefl/paddle_fl:latest
docker run --name <docker_name> --net=host -it -v $PWD:/root <image id> /bin/bash
docker run --name <docker_name> --net=host -it -v $PWD:/paddle <image id> /bin/bash
#Install paddle_fl
pip install paddle_fl
......
add_compile_options(-msse4.2 -maes)
set(PYBIND_SRCS
"./data_utils.cc"
)
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <atomic>
#include <set>
......@@ -21,8 +21,8 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "core/paddlefl_mpc/mpc_protocol/aby3_operators.h"
#include "core/privc3/fixedpoint_util.h"
#include "core/paddlefl_mpc/mpc_protocol/aby3_operators.h"
#include "core/psi/psi_api.h"
namespace py = pybind11;
......@@ -30,12 +30,13 @@ namespace py = pybind11;
namespace aby3 {
// split plaintext into three shares.
template <typename T, size_t N> py::array_t<T> share(double input) {
template<typename T, size_t N>
py::array_t<T> share(double input) {
size_t share_num = 3;
auto shares = py::array_t<T>(share_num);
py::buffer_info shares_buf = shares.request();
T *shares_buf_ptr = (T *)shares_buf.ptr;
T *ret_ptr[share_num];
T* shares_buf_ptr = (T*)shares_buf.ptr;
T* ret_ptr[share_num];
for (size_t i = 0; i < share_num; ++i) {
ret_ptr[i] = &shares_buf_ptr[i];
}
......@@ -46,10 +47,11 @@ template <typename T, size_t N> py::array_t<T> share(double input) {
}
// combine three shares to reveal plaintext.
template <typename T, size_t N> double reveal(py::array_t<T> shares) {
template<typename T, size_t N>
double reveal(py::array_t<T> shares) {
size_t share_num = 3;
py::buffer_info shares_buf = shares.request();
T *shares_buf_ptr = (T *)shares_buf.ptr;
T *shares_buf_ptr = (T *) shares_buf.ptr;
T *ret[share_num];
for (size_t idx = 0; idx < share_num; ++idx) {
......@@ -62,14 +64,15 @@ template <typename T, size_t N> double reveal(py::array_t<T> shares) {
}
// call psi_send
int send_psi(int port, const std::set<std::string> &input) {
int send_psi(int port, const std::set<std::string>& input) {
std::atomic<int> prog(0);
return psi::psi_send(port, input, &prog);
}
// call psi_recv
std::vector<std::string> recv_psi(const std::string &remote_ip, int port,
const std::set<std::string> &input) {
std::vector<std::string> recv_psi(const std::string &remote_ip,
int port,
const std::set<std::string>& input) {
std::vector<std::string> output;
std::atomic<int> prog(0);
int ret = psi::psi_recv(remote_ip, port, input, &output, &prog);
......@@ -80,7 +83,8 @@ std::vector<std::string> recv_psi(const std::string &remote_ip, int port,
return output;
}
PYBIND11_MODULE(mpc_data_utils, m) {
PYBIND11_MODULE(mpc_data_utils, m)
{
// optional module docstring
m.doc() = "pybind11 paddle-mpc plugin: data_utils (share, reveal, psi)";
......@@ -90,8 +94,11 @@ PYBIND11_MODULE(mpc_data_utils, m) {
"combine three shares to reveal plaintext.");
m.def("send_psi", &send_psi, "Send input in two party PSI.");
m.def("recv_psi", &recv_psi,
"Send input and return PSI result as output in two party PSI.");
m.def("recv_psi", &recv_psi, "Send input and return PSI result as output in two party PSI.");
m.attr("mpc_one_share") = (1 << paddle::mpc::ABY3_SCALING_FACTOR) / 3;
}
} // namespace aby3
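As a mental model for the `share`/`reveal` bindings above, here is a minimal sketch assuming simple additive fixed-point sharing over 64-bit integers with the scaling factor of 16 used elsewhere in this change; PaddleFL's actual ABY3 sharing is replicated and uses correlated randomness, so this is illustrative only. It also hints at why `mpc_one_share` is defined as `(1 << ABY3_SCALING_FACTOR) / 3`: three equal shares of that size recombine to roughly the fixed-point encoding of 1.

```cpp
// Illustrative additive fixed-point sharing over the ring Z_{2^64}
// (assumption: a simplified model, not PaddleFL's replicated ABY3 sharing).
#include <cstdint>
#include <iostream>
#include <random>

int main() {
    const size_t scaling_factor = 16;  // matches ABY3_SCALING_FACTOR above
    double plaintext = 3.14159;
    // Encode the plaintext as a fixed-point integer.
    uint64_t fixed = static_cast<uint64_t>(
        static_cast<int64_t>(plaintext * (1LL << scaling_factor)));

    // Split into three shares that sum to `fixed` modulo 2^64.
    std::mt19937_64 rng(42);
    uint64_t s0 = rng();
    uint64_t s1 = rng();
    uint64_t s2 = fixed - s0 - s1;

    // "Reveal": recombine the shares and decode the fixed-point value.
    int64_t revealed = static_cast<int64_t>(s0 + s1 + s2);
    std::cout << static_cast<double>(revealed) / (1LL << scaling_factor)
              << std::endl;  // prints ~3.14158
    return 0;
}
```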
add_compile_options(-msse4.2 -maes)
set(PROTO_SRCS
"./aby3_protocol.cc"
"./mesh_network.cc"
......@@ -17,3 +15,5 @@ target_link_libraries(mpc_protocol fluid_framework gloo hiredis privc3)
cc_test(mesh_network_test SRCS mesh_network_test.cc DEPS mpc_protocol)
cc_test(mpc_protocol_test SRCS mpc_protocol_test.cc DEPS mpc_protocol)
cc_test(mpc_instance_test SRCS mpc_instance_test.cc DEPS mpc_protocol)
......@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <algorithm>
#include <memory>
......@@ -30,13 +29,6 @@ using PseudorandomNumberGenerator = psi::PseudorandomNumberGenerator;
class AbstractContext {
public:
/*
AbstractContext(size_t party, std::shared_ptr<AbstractNetwork> network,
const block &seed = psi::g_zero_block,
const block &seed2 = psi::g_zero_block) {
init(party, network, seed, seed2);
}
*/
AbstractContext() = default;
AbstractContext(const AbstractContext &other) = delete;
......@@ -53,7 +45,7 @@ public:
}
void set_num_party(size_t num_party) {
PADDLE_ENFORCE_TRUE(num_party == 2 || num_party == 3,
PADDLE_ENFORCE_EQ(num_party == 2 || num_party == 3, true,
"2 or 3 party protocol is supported.");
_num_party = num_party;
}
......@@ -177,10 +169,9 @@ public:
private:
size_t _num_party;
size_t _party;
std::shared_ptr<AbstractNetwork> _network;
PseudorandomNumberGenerator _prng[3];
};
} // namespace mpc
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Description: implementations of each virtual op according to ABY3 protocol
......@@ -24,6 +24,7 @@
#include "core/privc3/boolean_tensor.h"
#include "core/privc3/aby3_context.h"
#include "core/privc3/fixedpoint_tensor.h"
#include "core/privc3/boolean_tensor.h"
#include "core/privc3/paddle_tensor.h"
namespace paddle {
......@@ -32,13 +33,14 @@ namespace mpc {
using paddle::framework::Tensor;
using aby3::ABY3Context;
// TODO: decide scaling factor
const size_t ABY3_SCALING_FACTOR = 16;
const size_t ABY3_SCALING_FACTOR = FIXED_POINTER_SCALING_FACTOR;
using FixedTensor = aby3::FixedPointTensor<int64_t, ABY3_SCALING_FACTOR>;
using BoolTensor = aby3::BooleanTensor<int64_t>;
using PaddleTensor = aby3::PaddleTensor<int64_t>;
class Aby3OperatorsImpl : public MpcOperators {
public:
void add(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
auto lhs_tuple = from_tensor(lhs);
......@@ -50,6 +52,7 @@ public:
auto out_ = std::get<0>(out_tuple).get();
lhs_->add(rhs_, out_);
}
// TODO: override
......@@ -122,8 +125,7 @@ public:
auto out_ = std::get<0>(out_tuple).get();
PaddleTensor scale_tensor(ContextHolder::device_ctx());
scale_tensor.from_float_point_scalar(factor, lhs_->shape(),
ABY3_SCALING_FACTOR);
scale_tensor.from_float_point_scalar(factor, lhs_->shape(), ABY3_SCALING_FACTOR);
lhs_->mul(&scale_tensor, out_);
}
......@@ -138,6 +140,18 @@ public:
op_->relu(out_);
}
void relu_with_derivative(const Tensor *op, Tensor *out, Tensor *derivative) override {
auto op_tuple = from_tensor(op);
auto out_tuple = from_tensor(out);
auto der_tuple = from_tensor<BoolTensor>(derivative);
auto op_ = std::get<0>(op_tuple).get();
auto out_ = std::get<0>(out_tuple).get();
auto der_ = std::get<0>(der_tuple).get();
op_->relu_with_derivative(out_, der_);
}
void sigmoid(const Tensor *op, Tensor *out) override {
auto op_tuple = from_tensor(op);
auto out_tuple = from_tensor(out);
......@@ -148,14 +162,34 @@ public:
op_->sigmoid(out_);
}
void softmax(const Tensor *op, Tensor *out) override {
void sigmoid_enhanced(const Tensor *op, Tensor *out) override {
auto op_tuple = from_tensor(op);
auto out_tuple = from_tensor(out);
auto op_ = std::get<0>(op_tuple).get();
auto out_ = std::get<0>(out_tuple).get();
op_->sigmoid_enhanced(out_);
}
void sigmoid_chebyshev(const Tensor *op, Tensor *out) override {
auto op_tuple = from_tensor(op);
auto out_tuple = from_tensor(out);
auto op_ = std::get<0>(op_tuple).get();
auto out_ = std::get<0>(out_tuple).get();
op_->sigmoid_chebyshev(out_);
}
void softmax(const Tensor *op, Tensor *out, bool use_relu, bool use_long_div) override {
auto op_tuple = from_tensor(op);
auto out_tuple = from_tensor(out);
auto op_ = std::get<0>(op_tuple).get();
auto out_ = std::get<0>(out_tuple).get();
op_->softmax(out_);
op_->softmax(out_, use_relu, use_long_div);
}
void gt(const Tensor *lhs, const Tensor *rhs, Tensor *out) override {
......@@ -239,8 +273,8 @@ public:
out->data<int64_t>(), [](int64_t b) { return 1 - b; });
}
void relu_grad(const Tensor *y, const Tensor *dy, Tensor *dx,
float point = 0.0f) override {
void relu_grad(const Tensor *y, const Tensor *dy,
Tensor *dx, float point = 0.0f) override {
auto y_tuple = from_tensor(y);
......@@ -248,8 +282,7 @@ public:
PaddleTensor point_(ContextHolder::device_ctx());
point_.from_float_point_scalar<float>(point, y_->shape(),
ABY3_SCALING_FACTOR);
point_.from_float_point_scalar<float>(point, y_->shape(), ABY3_SCALING_FACTOR);
auto tmp0 = ContextHolder::tensor_factory()->create_int64_t(y_->shape());
auto tmp1 = ContextHolder::tensor_factory()->create_int64_t(y_->shape());
......@@ -267,24 +300,77 @@ public:
bool_out.mul(dy_, out_);
}
void arith_bool_mul(const Tensor* op_a, const Tensor* op_b, Tensor* out) override {
auto a_tuple = from_tensor(op_a);
auto a_ = std::get<0>(a_tuple).get();
auto b_tuple = from_tensor<BoolTensor>(op_b);
auto b_ = std::get<0>(b_tuple).get();
auto out_tuple = from_tensor(out);
auto out_ = std::get<0>(out_tuple).get();
b_->mul(a_, out_);
}
void max_pooling(const Tensor* in, Tensor* out, Tensor* pos_info) override {
auto a_tuple = from_tensor(in);
auto a_ = std::get<0>(a_tuple).get();
auto b_tuple = from_tensor<BoolTensor>(pos_info);
auto b_ = std::get<0>(b_tuple).get();
auto out_tuple = from_tensor(out);
auto out_ = std::get<0>(out_tuple).get();
a_->max_pooling(out_, b_);
}
void inverse_square_root(const Tensor* in, Tensor* out) override {
auto x_tuple = from_tensor(in);
auto x_ = std::get<0>(x_tuple).get();
auto y_tuple = from_tensor(out);
auto y_ = std::get<0>(y_tuple).get();
x_->inverse_square_root(y_);
}
private:
std::tuple<std::shared_ptr<FixedTensor>, std::shared_ptr<PaddleTensor>,
std::shared_ptr<PaddleTensor>>
from_tensor(const Tensor *t) {
template <typename T>
std::tuple<
std::shared_ptr<T>,
std::shared_ptr<PaddleTensor>,
std::shared_ptr<PaddleTensor> > from_tensor(const Tensor* t) {
PADDLE_ENFORCE_EQ(t->dims()[0], 2);
auto pt0 = std::make_shared<PaddleTensor>(ContextHolder::device_ctx(),
t->Slice(0, 1));
auto pt1 = std::make_shared<PaddleTensor>(ContextHolder::device_ctx(),
t->Slice(1, 2));
auto pt0 = std::make_shared<PaddleTensor>(ContextHolder::device_ctx(), t->Slice(0, 1));
auto pt1 = std::make_shared<PaddleTensor>(ContextHolder::device_ctx(), t->Slice(1, 2));
aby3::TensorAdapter<int64_t> *pt_array[2] = {pt0.get(), pt1.get()};
// remove leading 1 in shape
auto shape = pt0->shape();
shape.erase(shape.begin());
pt0->reshape(shape);
pt1->reshape(shape);
auto ft = std::make_shared<FixedTensor>(pt_array);
aby3::TensorAdapter<int64_t>* pt_array[2] = {pt0.get(), pt1.get()};
auto ft = std::make_shared<T>(pt_array);
return std::make_tuple(ft, pt0, pt1);
}
std::tuple<
std::shared_ptr<FixedTensor>,
std::shared_ptr<PaddleTensor>,
std::shared_ptr<PaddleTensor> > from_tensor(const Tensor* t) {
return from_tensor<FixedTensor>(t);
}
};
} // mpc
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Description:
// abstract mpc operation interface
......@@ -24,6 +24,9 @@ namespace mpc {
using paddle::framework::Tensor;
// TODO: decide scaling factor
const size_t FIXED_POINTER_SCALING_FACTOR = 16;
class MpcOperators {
public:
virtual void add(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;
......@@ -42,9 +45,16 @@ public:
virtual void relu(const Tensor *op, Tensor *out) = 0;
virtual void relu_with_derivative(const Tensor *op, Tensor *out,
Tensor *derivative) = 0;
virtual void sigmoid(const Tensor *op, Tensor *out) = 0;
virtual void softmax(const Tensor *op, Tensor *out) = 0;
virtual void sigmoid_enhanced(const Tensor *op, Tensor *out) = 0;
virtual void sigmoid_chebyshev(const Tensor *op, Tensor *out) = 0;
virtual void softmax(const Tensor *op, Tensor *out, bool use_relu, bool use_long_div) = 0;
virtual void gt(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;
......@@ -58,9 +68,23 @@ public:
virtual void neq(const Tensor *lhs, const Tensor *rhs, Tensor *out) = 0;
virtual void relu_grad(const Tensor *y, const Tensor *dy, Tensor *dx,
const float point) = 0;
virtual void relu_grad(const Tensor *y, const Tensor *dy, Tensor *dx, const float point) = 0;
// arithmetic tensor mult boolean tensor, element-wisely
// see [ABY3, sec 5.4.1]
// for aby3 only
// example (in plaintext):
// [1, 2, 3, 4] * [0, 0, 1, 0] = [0, 0, 3, 0]
virtual void arith_bool_mul(const Tensor* op_a, const Tensor* op_b, Tensor* out) {}
// max pooling in which shape of filter is nx1
// pos_info keeps which element is max in a col, for backward grad
// for filter in other shape, reshape input first
virtual void max_pooling(const Tensor* in, Tensor* out, Tensor* pos_info) {}
virtual void inverse_square_root(const Tensor* in, Tensor* out) = 0;
};
} // mpc
} // paddle
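To make the contract of the new optional `arith_bool_mul` and `max_pooling` hooks concrete, here is a plaintext-level sketch; as an assumption it operates on clear values rather than shares, so it only illustrates the intended semantics.

```cpp
// Plaintext illustration of the arith_bool_mul and max_pooling contracts
// (assumption: simplified, works on clear values instead of secret shares).
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // arith_bool_mul: [1, 2, 3, 4] * [0, 0, 1, 0] = [0, 0, 3, 0]
    std::vector<int64_t> a = {1, 2, 3, 4};
    std::vector<int64_t> b = {0, 0, 1, 0};
    for (size_t i = 0; i < a.size(); ++i) std::cout << a[i] * b[i] << " ";
    std::cout << "\n";

    // max_pooling over an n x 1 column: output is the max, pos_info marks it.
    std::vector<int64_t> col = {7, 2, 9, 4};
    size_t argmax = 0;
    for (size_t i = 1; i < col.size(); ++i) {
        if (col[i] > col[argmax]) argmax = i;
    }
    std::vector<int64_t> pos_info(col.size(), 0);
    pos_info[argmax] = 1;  // one-hot position kept for the backward grad
    std::cout << "max = " << col[argmax] << ", pos_info = ";
    for (auto v : pos_info) std::cout << v << " ";
    std::cout << "\n";
    return 0;
}
```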
add_compile_options(-msse4.2 -maes)
aux_source_directory(. DIR_SRCS)
add_library(mpc_ops_o OBJECT ${DIR_SRCS})
aux_source_directory(./math MATH_SRCS)
add_library(mpc_ops_o OBJECT ${DIR_SRCS} ${MATH_SRCS})
add_dependencies(mpc_ops_o fluid_framework gloo)
add_library(mpc_ops STATIC $<TARGET_OBJECTS:mpc_ops_o>)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "./conv_op.h"
#include <memory>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
std::vector<int64_t> ConvOp::ComputeOutputShape(
framework::InferShapeContext* ctx) const {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv");
OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::string padding_algorithm =
ctx->Attrs().Get<std::string>("padding_algorithm");
int groups = ctx->Attrs().Get<int>("groups");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
const std::string data_format = ctx->Attrs().Get<std::string>("data_format");
// MKL-DNN Kernels are using NCHW order of dims description
// so we ignore data_format consideration for MKL-DNN kernel
const bool channel_last = (this->IsMKLDNNType() == false) &&
(data_format == "NHWC" || data_format == "NDHWC");
PADDLE_ENFORCE_EQ(
// 1 for share dim
in_dims.size() == 4 + 1 || in_dims.size() == 5 + 1, true,
platform::errors::InvalidArgument(
"The input of Op(Conv) should be a 4-D or 5-D Tensor. But "
"received: input's dimension is %u, input's shape is [%s].",
in_dims.size(), in_dims));
PADDLE_ENFORCE_EQ(
in_dims.size(), filter_dims.size(),
platform::errors::InvalidArgument(
"The input's dimension and filter's dimension of "
"Op(Conv) should be equal. But received: the input's shape is [%s], "
"the input's dimension is %d; the filter's shape is [%s], "
"the filter's dimension is %d.",
in_dims, in_dims.size(), filter_dims, filter_dims.size()));
int in_sub_stride_size = in_dims.size() - strides.size();
PADDLE_ENFORCE_EQ(
in_dims.size(), strides.size() + 2U + 1,
platform::errors::InvalidArgument(
"The difference of input's dimension and Attr(strides)'s "
"length must be euqal to 2 for Op(Conv). "
"But received: input's dimension is %d, input's shape is [%s]; "
"Attr(stride)'s length is %d, Attr(stride) is [%s]; "
"difference of input's dimention and Attr(strides)'s length = %u.",
in_dims.size(), in_dims, strides.size(),
framework::make_ddim(strides), in_sub_stride_size));
const auto input_channels =
channel_last ? in_dims[in_dims.size() - 1] : in_dims[1 + 1];
PADDLE_ENFORCE_EQ(
input_channels, filter_dims[1 + 1] * groups,
platform::errors::InvalidArgument(
"The number of input's channels should be equal to filter's channels "
"* groups for Op(Conv). But received: the input's channels is %d, "
"the input's shape is [%s]; the filter's channels is %d, the "
"filter's shape is [%s]; the groups is %d, the data_format is %s. "
"The error may come from wrong data_format setting.",
input_channels, in_dims, filter_dims[1 + 1], filter_dims, groups,
data_format));
PADDLE_ENFORCE_EQ(
filter_dims[0 + 1] % groups, 0,
platform::errors::InvalidArgument(
"The number of output's channels (filter's first dimension) of "
"Op(Conv) should be divided by groups. But received: "
"the output channels is %d, the filter's shape is [%s], "
"the groups is %d.",
filter_dims[0 + 1], filter_dims, groups));
framework::DDim in_data_dims;
if (channel_last) {
in_data_dims = framework::slice_ddim(in_dims, 1 + 1, in_dims.size() - 1);
} else {
in_data_dims = framework::slice_ddim(in_dims, 2 + 1, in_dims.size());
}
framework::DDim filter_data_dims =
framework::slice_ddim(filter_dims, 2 + 1, filter_dims.size());
std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
std::vector<int64_t> output_shape({in_dims[0], in_dims[1]});
if (!channel_last) {
output_shape.push_back(filter_dims[0 + 1]);
}
for (int i = 0; i < in_data_dims.size(); ++i) {
if ((!ctx->IsRuntime()) &&
(in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
output_shape.push_back(-1);
} else {
output_shape.push_back(
ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i],
paddings[2 * i], paddings[2 * i + 1], strides[i]));
}
}
if (channel_last) {
output_shape.push_back(filter_dims[1]);
}
return output_shape;
}
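Because every MPC tensor carries a leading share dimension, the `mpc_conv2d` input here is 5-D with layout (share, batch, channel, height, width) in the channel-first case rather than the usual 4-D NCHW, which is what the `+ 1` offsets above account for. A worked example of the resulting output-shape arithmetic (the concrete sizes below are hypothetical, for illustration only):

```cpp
// Worked example of the mpc_conv2d output-shape arithmetic with a leading
// share dim (sizes are hypothetical; single group, dilation 1).
#include <iostream>
#include <vector>

int conv_output_size(int in, int k, int dilation, int pad0, int pad1, int stride) {
    int dkernel = dilation * (k - 1) + 1;
    return (in + pad0 + pad1 - dkernel) / stride + 1;
}

int main() {
    // Input (S, N, C, H, W) = (2, 8, 3, 32, 32); filter (S, M, C, kH, kW) = (2, 16, 3, 3, 3).
    std::vector<int> in = {2, 8, 3, 32, 32};
    std::vector<int> filter = {2, 16, 3, 3, 3};
    int stride = 1, pad = 1, dilation = 1;

    std::vector<int> out = {in[0], in[1], filter[1]};  // share, batch, out channels
    for (int i = 3; i < 5; ++i) {
        out.push_back(conv_output_size(in[i], filter[i], dilation, pad, pad, stride));
    }
    // Expected: 2 8 16 32 32
    for (int d : out) std::cout << d << " ";
    std::cout << "\n";
    return 0;
}
```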
framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input");
std::string data_format =
"AnyLayout"; // todo enable data layout when it's ready
framework::DataLayout layout = framework::StringToDataLayout(data_format);
if (input_data_type != framework::proto::VarType::INT8 &&
input_data_type != framework::proto::VarType::UINT8) {
auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
platform::errors::InvalidArgument(
"input and filter data type should be consistent"));
}
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
platform::errors::InvalidArgument(
"float16 can only be used when CUDNN is used"));
}
auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library, customized_type_value);
return type;
}
framework::OpKernelType ConvOp::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
void Conv2DOpMaker::Make() {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput("Input",
"(Tensor) The input tensor of convolution operator. "
"The format of input tensor is NCHW or NHWC, where N is batch size, "
"C is the "
"number of channels, H is the height of the feature, "
"and W is the width of the feature.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution operator. "
"The format of the filter tensor is MCHW, where M is the number of "
"output image channels, C is the number of input image channels, "
"H is the height of the filter, and W is the width of the filter. "
"If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups.");
AddInput("Bias",
"(Tensor) Bias to be added to each output of filter application."
"The format of output tensor is X (one-dimensional) of size equal"
"to the number of output channels. Only used with MKL-DNN.")
.AsDispensable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"It has same data fromat and data type as the Input.");
AddAttr<std::vector<int>>("strides",
"(vector<int> default:{1, 1}), the "
"strides(h_stride, w_stride) of "
"convolution operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings",
"(vector<int> default:{0, 0}), the "
"paddings(pad_height_top, pad_height_bottom, "
"pad_width_left, pad_wifth_right) of "
"convolution operator.")
.SetDefault({0, 0});
AddAttr<std::string>(
"padding_algorithm",
"(string, default \"EXPLICIT\") An optional string from: \"EXPLICIT\","
"\"SAME\",\"VALID\". Set to \"EXPLICIT\" for explicit padding. "
"Set to \"SAME\" or \"VALID\" for algorithm of padding. ")
.SetDefault("EXPLICIT");
AddAttr<int>(
"groups",
"(int default:1), the groups number of the convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters "
"is only connected to the second half of the input channels.")
.SetDefault(1);
AddAttr<std::vector<int>>("dilations",
"(vector<int> default:{1, 1}), the "
"dilations(h_dilation, w_dilation) of "
"convolution operator.")
.SetDefault({1, 1});
AddAttr<bool>("use_quantizer",
"(bool, default false) "
"Set to true for operators that should be quantized and use "
"int8 kernel. "
"Only used on CPU.")
.SetDefault(false);
AddAttr<float>("Scale_in",
"Scale_in to be used for int8 input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_out",
"Scale_out to be used for int8 output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_in_eltwise",
"Scale_in_eltwise to be used for int8 eltwise input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("NCHW");
// TODO(dzhwinter): need to registered layout transform function
AddAttr<bool>("exhaustive_search",
"(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search "
"for cuDNN convolution or not, default is False.")
.SetDefault(false);
AddComment(R"DOC(
Convolution Operator.
The convolution operation calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
Input(Input) and Output(Output) are in NCHW or NHWC format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature.
Filters(Input) is in MCHW format, where M is the number of output image channels, C is
the number of input image channels, H is the height of the filter, and W
is the width of the filter.
Parameters (strides, paddings, dilations) each contain two elements. These two elements represent
height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output:
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
$$
H_{out}= \frac{(H_{in} + pad_height_top + pad_height_bottom - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
W_{out}= \frac{(W_{in} + pad_width_left + pad_width_right - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
$$
)DOC");
Apply();
}
void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
}
if (ctx->HasOutput(framework::GradVarName("Filter"))) {
ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
}
}
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
auto type = framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(),
layout_, library_, customized_type_value);
return type;
}
framework::OpKernelType ConvOpGrad::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
template <typename T>
class Conv2DGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
void Apply(GradOpPtr<T> op) const override {
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("Input", this->Input("Input"));
op->SetInput("Filter", this->Input("Filter"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output"));
op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
op->SetOutput(framework::GradVarName("Filter"), this->InputGrad("Filter"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
op->SetAttrMap(this->Attrs());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mpc_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
ops::ConvOpInferVarType,
ops::Conv2DGradMaker<paddle::framework::OpDesc>,
ops::Conv2DGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mpc_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(
mpc_conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
mpc_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "./math/im2col.h"
#include "./math/vol2col.h"
#include "./math/math_function.h"
#include "mpc_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
constexpr int kConvMKLDNNFP32 = 1;
constexpr int kConvMKLDNNINT8 = 2;
constexpr int MaxKeyLength = 256;
// Base convolution operator definitions for other conv
// like operators to reuse the implementation.
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
PADDLE_ENFORCE_GT(
output_size, 0,
platform::errors::InvalidArgument(
"The output's size is expected to be greater than 0. "
"But recieved: output's size is %d. The output's size is computed by "
"((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / "
"stride + 1), where input_size is %d, padding is %d, "
"filter_size is %d, dilation is %d, stride is %d.",
output_size, input_size, padding, filter_size, dilation, stride));
return output_size;
}
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding_1, int padding_2, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + padding_1 + padding_2 - dkernel) / stride + 1;
PADDLE_ENFORCE_GT(
output_size, 0,
platform::errors::InvalidArgument(
"The output's size is expected to be greater than 0. "
"But recieved: output's size is %d. The output's size is computed by "
"((input_size + padding_1 + padding_2 - (dilation * (filter_size - "
"1) + 1)) / stride + 1), where input_size is %d, padding is "
"(%d, %d), filter_size is %d, dilation is %d, stride is %d.",
output_size, input_size, padding_1, padding_2, filter_size, dilation,
stride));
return output_size;
}
template <typename T = int>
inline void UpdatePaddingAndDilation(std::vector<T>* paddings,
std::vector<T>* dilation,
const std::string& padding_algorithm,
const framework::DDim data_dims,
const std::vector<T>& strides,
const std::vector<T>& ksize) {
// set padding size == data_dims.size() * 2
auto data_shape = framework::vectorize<T>(data_dims);
if (static_cast<int>(paddings->size()) == data_dims.size()) {
for (int i = 0; i < data_dims.size(); ++i) {
T copy_pad = *(paddings->begin() + 2 * i);
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
}
} else {
PADDLE_ENFORCE_EQ(
data_dims.size() * 2, paddings->size(),
platform::errors::InvalidArgument(
"Attribute padding's size should be the same or twice as the "
"input's dimension. "
"But recieved: padding's size is %d, padding is [%s]; input's "
"dimension is %d, input's shape is [%s].",
paddings->size(), framework::make_ddim(*paddings), data_dims.size(),
data_dims));
}
// when padding_algorithm is "VALID" or "SAME"
if (padding_algorithm == "SAME") {
for (int i = 0; i < data_dims.size(); ++i) {
T out_size = (data_dims[i] + strides[i] - 1) / strides[i];
T pad_sum =
std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i],
static_cast<T>(0));
T pad_0 = pad_sum / 2;
T pad_1 = pad_sum - pad_0;
*(paddings->begin() + i * 2) = pad_0;
*(paddings->begin() + i * 2 + 1) = pad_1;
// dilation
*(dilation->begin() + i) = 1;
}
} else if (padding_algorithm == "VALID") {
for (auto it = paddings->begin(); it != paddings->end(); it++) {
*it = 0;
}
}
}
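A small worked sketch of how the "SAME" branch above resolves the padding for a single spatial dimension (the sizes are hypothetical):

```cpp
// SAME-padding arithmetic from UpdatePaddingAndDilation, worked on one
// spatial dimension (hypothetical sizes; dilation is forced to 1, as above).
#include <algorithm>
#include <iostream>

int main() {
    int data = 13, stride = 2, ksize = 3;
    int out_size = (data + stride - 1) / stride;                        // ceil(13 / 2) = 7
    int pad_sum = std::max((out_size - 1) * stride + ksize - data, 0);  // 12 + 3 - 13 = 2
    int pad_0 = pad_sum / 2;                                            // 1
    int pad_1 = pad_sum - pad_0;                                        // 1
    std::cout << "out=" << out_size << " pad=(" << pad_0 << "," << pad_1 << ")\n";
    return 0;
}
```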
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
// extra 1 for share dim
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2 + 1]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
if (paddings.size() != strides.size()) {
for (size_t j = 0; j < paddings.size(); ++j) {
padding_0 = padding_0 && (paddings[j] == 0);
}
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
// extra 1 for leading share dim S
int dim = input->dims().size() - 2 - 1;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
// SNDHWC -> NCSDHW
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[0] = input->dims()[1];
in_dims_vec[1] = input->dims()[5];
in_dims_vec[2] = input->dims()[0];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
in_dims_vec[5] = input->dims()[4];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
// SNHWC -> NCSHW
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[0] = input->dims()[1];
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[0];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
// extra 1 for leading share dim S
int dim = input->dims().size() - 2 - 1;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
// NCSDHW -> SNDHWC
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[0] = input->dims()[2];
in_dims_vec[1] = input->dims()[0];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[5];
in_dims_vec[5] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
// NCSHW -> SNHWC
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[0] = input->dims()[2];
in_dims_vec[1] = input->dims()[0];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void ResizeToShareLast(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
transformed_input->Resize(input->dims());
// SNC.. -> NCS..
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[0] = input->dims()[1];
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[0];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
template <typename DeviceContext, typename T>
inline void ResizeToShareFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
transformed_input->Resize(input->dims());
// NCS.. -> SNC..
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[0] = input->dims()[2];
in_dims_vec[1] = input->dims()[0];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
// extra 1 for leading share dim
// swap share and batch_size
int dim = input->dims().size() - 2 - 1;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{1, 5, 0, 2, 3, 4};
math::Transpose<DeviceContext, T, 6> trans6;
trans6(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{1, 4, 0, 2, 3};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
// extra 1 for leading share dim
// swap share and batch_size
int dim = input->dims().size() - 2 - 1;
if (dim == 3) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{2, 0, 3, 4, 5, 1};
math::Transpose<DeviceContext, T, 6> trans6;
trans6(dev_ctx, *input, transformed_input, axis);
} else if (dim == 2) {
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> axis{2, 0, 3, 4, 1};
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToShareFirst(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
int dim = input->dims().size();
PADDLE_ENFORCE_GT(
dim, 4,
platform::errors::InvalidArgument(
"The input's dim is expected to be greater than 4."));
std::vector<int> axis(dim);
for (size_t i = 3; i < dim; ++i) {
axis[i] = i;
}
// share
axis[0] = 2;
// N
axis[1] = 0;
// C
axis[2] = 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
switch(dim) {
case 5:
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
break;
case 6:
math::Transpose<DeviceContext, T, 6> trans6;
trans6(dev_ctx, *input, transformed_input, axis);
break;
default:
PADDLE_ENFORCE_LT(
dim, 7, platform::errors::InvalidArgument(
"The input's dim greater than 6 not supported yet. "));
}
}
template <typename DeviceContext, typename T>
inline void TransToShareLast(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
int dim = input->dims().size();
PADDLE_ENFORCE_GT(
dim, 4,
platform::errors::InvalidArgument(
"The input's dim is expected to be greater than 4."));
std::vector<int> axis(dim);
for (size_t i = 3; i < dim; ++i) {
axis[i] = i;
}
// SNC -> NCS
axis[0] = 1;
axis[1] = 2;
axis[2] = 0;
auto& dev_ctx = context.template device_context<DeviceContext>();
switch(dim) {
case 5:
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
break;
case 6:
math::Transpose<DeviceContext, T, 6> trans6;
trans6(dev_ctx, *input, transformed_input, axis);
break;
default:
PADDLE_ENFORCE_LT(
dim, 7, platform::errors::InvalidArgument(
"The input's dim greater than 6 not supported yet. "));
}
}
template <typename DeviceContext, typename T>
inline void TransToBatchFirst(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
int dim = input->dims().size();
PADDLE_ENFORCE_GT(
dim, 4,
platform::errors::InvalidArgument(
"The input's dim is expected to be greater than 4."));
std::vector<int> axis(dim);
for (size_t i = 3; i < dim; ++i) {
axis[i] = i;
}
// N
axis[0] = 1;
// C
axis[1] = 2;
// share
axis[2] = 0;
auto& dev_ctx = context.template device_context<DeviceContext>();
switch(dim) {
case 5:
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, transformed_input, axis);
break;
case 6:
math::Transpose<DeviceContext, T, 6> trans6;
trans6(dev_ctx, *input, transformed_input, axis);
break;
default:
PADDLE_ENFORCE_LT(
dim, 7, platform::errors::InvalidArgument(
"The input's dim greater than 6 not supported yet. "));
}
}
template <typename DeviceContext, typename T>
inline void ResizeToSwapedLeadingDims(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
transformed_input->Resize(input->dims());
// NS.. -> SN..
// or CS.. -> SC..
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[0] = input->dims()[1];
in_dims_vec[1] = input->dims()[0];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
template <typename DeviceContext, typename T>
void TransToSwapedLeadingDims(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* output){
output->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[0] = input->dims()[1];
in_dims_vec[1] = input->dims()[0];
output->Resize(framework::make_ddim(in_dims_vec));
output->mutable_data<T>(context.GetPlace());
const int dim = input->dims().size();
std::vector<int> axis(dim);
for (size_t i = 0; i < dim; ++i) {
axis[i] = i;
}
axis[0] = 1;
axis[1] = 0;
auto& dev_ctx = context.template device_context<DeviceContext>();
switch(dim) {
case 3:
math::Transpose<DeviceContext, T, 3> trans3;
trans3(dev_ctx, *input, output, axis);
break;
case 4:
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, output, axis);
break;
case 5:
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, output, axis);
break;
case 6:
math::Transpose<DeviceContext, T, 6> trans6;
trans6(dev_ctx, *input, output, axis);
break;
default:
PADDLE_ENFORCE_GT(
dim, 2, platform::errors::InvalidArgument(
"The input's dim less than 3 not supported yet. "));
PADDLE_ENFORCE_LT(
dim, 7, platform::errors::InvalidArgument(
"The input's dim greater than 6 not supported yet. "));
}
return;
}
template <typename DeviceContext, typename T, typename Func>
void SharesToCols(const framework::ExecutionContext& context,
const Tensor* input,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings,
Tensor* col, Func data2col) {
// input: CSHW or CSDHW, S for share dim
framework::DDim in_plain_dim =
framework::slice_ddim(input->dims(), 1, input->dims().size());
framework::DDim col_plain_dim =
framework::slice_ddim(col->dims(), 1, col->dims().size());
auto& dev_ctx = context.template device_context<DeviceContext>();
const int share_size = input->dims()[0];
for (size_t i = 0; i < share_size; ++i) {
Tensor share = input->Slice(i, i + 1).Resize(in_plain_dim);
Tensor col_share = col->Slice(i, i + 1).Resize(col_plain_dim);
data2col(dev_ctx, share, dilations, strides, paddings, &col_share);
}
}
template <typename DeviceContext, typename T>
Tensor SwapedLeadingDims(const framework::ExecutionContext& context,
const Tensor* input) {
Tensor output(input->type());
ResizeToSwapedLeadingDims<DeviceContext, T>(context, input,
&output);
TransToSwapedLeadingDims<DeviceContext, T>(context, input,
&output);
return output;
}
template <typename DeviceContext, typename T>
Tensor TransposeMpcMat(const framework::ExecutionContext& context,
const Tensor* input) {
Tensor output(input->type());
auto in_dims_vec = framework::vectorize(input->dims());
PADDLE_ENFORCE_EQ(
in_dims_vec.size(), 3, platform::errors::InvalidArgument(
"The input's dim should be 3. "));
in_dims_vec[0] = input->dims()[0];
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
output.Resize(framework::make_ddim(in_dims_vec));
output.mutable_data<T>(context.GetPlace());
std::vector<int> axis(3);
axis[0] = 0;
axis[1] = 2;
axis[2] = 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
math::Transpose<DeviceContext, T, 3> trans3;
trans3(dev_ctx, *input, &output, axis);
return output;
}
// Define Op classes in .h file so that other conv
// operator implementations can reuse the code.
class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() final;
protected:
virtual void Apply() {}
};
class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
const override {
static std::unordered_map<std::string, std::string> m{
{"Input", /*->*/ "Output"}};
return m;
}
};
class ConvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
std::vector<int64_t> output_shape = ComputeOutputShape(ctx);
OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "Conv");
ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
ctx->ShareLoD("Input", "Output");
}
protected:
std::vector<int64_t> ComputeOutputShape(
framework::InferShapeContext* ctx) const;
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override;
};
class ConvOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override;
};
// TODO: add conv double grad
template <typename DeviceContext, typename T>
class GemmConvKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
// The filter will be reshaped in the calculations,
// so here use an assignment operation,
// that avoids modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
const int groups = context.Attr<int>("groups");
const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
Tensor transformed_input(input->type());
Tensor transformed_output(output->type());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input,
&transformed_input);
TransToChannelFirst<DeviceContext, T>(context, input, &transformed_input);
ResizeToChannelFirst<DeviceContext, T>(context, output,
&transformed_output);
} else {
ResizeToShareLast<DeviceContext, T>(context, input,
&transformed_input);
TransToShareLast<DeviceContext, T>(context, input, &transformed_input);
ResizeToShareLast<DeviceContext, T>(context, output,
&transformed_output);
}
// update padding and dilation
auto trans_in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
// extra 1 for share dim
framework::DDim in_data_dims =
framework::slice_ddim(trans_in_dims, 2 + 1, trans_in_dims.size());
framework::DDim filter_data_dims =
framework::slice_ddim(filter_dims, 2 + 1, filter_dims.size());
std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
auto& dev_ctx = context.template device_context<DeviceContext>();
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
// filter_shape_vec:
// {k_share, k_o, k_i, k_h, k_w} or {k_share, k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
// output_shape_vec:
// {o_n, o_c, o_share, o_h, o_w} or {o_n, o_c, o_share, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
framework::vectorize(transformed_output.dims()));
// use col_shape in the im2col calculation
// col_shape_vec:
// {i_s, i_c/g, k_h, k_w, o_h, o_w} or {i_s, i_c/g, k_d, k_h, k_w,
// o_d, o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2 - 1;
std::vector<int64_t> col_shape_vec(2 + 2 * data_dim);
col_shape_vec[0] = trans_in_dims[2];
col_shape_vec[1] = trans_in_dims[1] / groups;
std::vector<int64_t> col_matrix_shape_vec(3);
col_matrix_shape_vec[0] = col_shape_vec[0];
col_matrix_shape_vec[1] = col_shape_vec[1];
col_matrix_shape_vec[2] = 1;
// use col_matrix_shape in the gemm calculation
// size:
// (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
// o_w)
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 2] = filter_shape_vec[j + 3];
col_shape_vec[j + 2 + data_dim] = output_shape_vec[j + 3];
col_matrix_shape_vec[1] *= filter_shape_vec[j + 3];
col_matrix_shape_vec[2] *= output_shape_vec[j + 3];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape(framework::make_ddim(col_matrix_shape_vec));
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
if (is_expand) {
col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
// with share dim
framework::DDim in_matrix_shape = framework::slice_ddim(
transformed_input.dims(), 1, transformed_input.dims().size());
// SOIHW or SOIDHW
framework::DDim filter_matrix_shape = {filter.dims()[0], filter.dims()[1],
filter.numel() / (filter.dims()[0] * filter.dims()[1]) };
filter.Resize(filter_matrix_shape);
// OSIHW or OSIDHW
Tensor filter_ = SwapedLeadingDims<DeviceContext, T>(context, &filter);
// CS(H * W) or CS(D * H * W)
framework::DDim output_matrix_shape = {
transformed_output.dims()[1],
transformed_output.dims()[2],
transformed_output.numel() /
(transformed_output.dims()[0]
* transformed_output.dims()[1]
* transformed_output.dims()[2])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output.dims()[1]) / groups;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch =
transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
Tensor out_batch =
transformed_output.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
Tensor in_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &in_slice);
if (!is_expand) {
col.ShareDataWith(in_slice_);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
SharesToCols<DeviceContext, T>(context, &in_slice_, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]}, &col, im2col);
} else if (data_dim == 3U) {
SharesToCols<DeviceContext, T>(context, &in_slice_, dilations, strides, paddings, &col, vol2col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter_.Slice(g * out_step, (g + 1) * out_step);
Tensor out_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &out_slice);
Tensor filter_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &filter_slice);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->matmul(
&filter_slice_, &col_matrix, &out_slice_);
TransToSwapedLeadingDims<DeviceContext, T>(context, &out_slice_,
&out_slice);
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(context, &transformed_output,
output);
} else {
TransToShareFirst<DeviceContext, T>(context, &transformed_output,
output);
}
}
};
template <typename DeviceContext, typename T>
class GemmConvGradKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
// The filter and filter_grad will be reshaped in the calculations,
// so here use an assignment operation,
// that avoids modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
int groups = context.Attr<int>("groups");
const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
Tensor transformed_input(input->type());
Tensor transformed_output_grad(output_grad->type());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input,
&transformed_input);
TransToChannelFirst<DeviceContext, T>(context, input, &transformed_input);
ResizeToChannelFirst<DeviceContext, T>(context, output_grad,
&transformed_output_grad);
TransToChannelFirst<DeviceContext, T>(context, output_grad,
&transformed_output_grad);
} else {
ResizeToShareLast<DeviceContext, T>(context, input,
&transformed_input);
TransToShareLast<DeviceContext, T>(context, input, &transformed_input);
ResizeToShareLast<DeviceContext, T>(context, output_grad,
&transformed_output_grad);
TransToShareLast<DeviceContext, T>(context, output_grad, &transformed_output_grad);
}
// update padding and dilation
auto in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
// extra 1 for share dim
framework::DDim in_data_dims =
framework::slice_ddim(in_dims, 2 + 1, in_dims.size());
framework::DDim filter_data_dims =
framework::slice_ddim(filter_dims, 2 + 1, filter_dims.size());
std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
auto& dev_ctx = context.template device_context<DeviceContext>();
// filter_shape_vec: {k_share, k_o, k_i, k_h, k_w} or {k_share, k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
// output_shape_vec: {o_n, o_c, o_share, o_h, o_w} or {o_n, o_c, o_share, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
framework::vectorize(transformed_output_grad.dims()));
// use col_shape in the im2col calculation
// col_shape_vec: {i_s, i_c/g, k_h, k_w, o_h, o_w} or
// {i_s, i_c/g, k_d, k_h, k_w, o_d, o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2 - 1;
std::vector<int64_t> col_shape_vec(2 + 2 * data_dim);
col_shape_vec[0] = in_dims[2];
col_shape_vec[1] = in_dims[1] / groups;
std::vector<int64_t> col_matrix_shape_vec(3);
col_matrix_shape_vec[0] = col_shape_vec[0];
col_matrix_shape_vec[1] = col_shape_vec[1];
col_matrix_shape_vec[2] = 1;
// use col_matrix_shape in the gemm calculation
// size:
// (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
// o_w)
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 2] = filter_shape_vec[j + 3];
col_shape_vec[j + 2 + data_dim] = output_shape_vec[j + 3];
col_matrix_shape_vec[1] *= filter_shape_vec[j + 3];
col_matrix_shape_vec[2] *= output_shape_vec[j + 3];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape(framework::make_ddim(col_matrix_shape_vec));
// with share dim
framework::DDim input_shape = framework::slice_ddim(
transformed_input.dims(), 1, transformed_input.dims().size());
// SOIHW or SOIDHW
framework::DDim filter_matrix_shape = {filter.dims()[0], filter.dims()[1],
filter.numel() / (filter.dims()[0] * filter.dims()[1]) };
// OSIHW or OSIDHW
framework::DDim filter_matrix_shape_ = {filter.dims()[1], filter.dims()[0],
filter.numel() / (filter.dims()[0] * filter.dims()[1]) };
filter.Resize(filter_matrix_shape);
Tensor filter_ = SwapedLeadingDims<DeviceContext, T>(context, &filter);
// CS(H * W) or CS(D * H * W)
framework::DDim output_matrix_shape = {
transformed_output_grad.dims()[1],
transformed_output_grad.dims()[2],
transformed_output_grad.numel() /
(transformed_output_grad.dims()[0]
* transformed_output_grad.dims()[1]
* transformed_output_grad.dims()[2])};
// convolution backward input operator: gemm + col2im(or col2vol)
// convolution backward weight operator: im2col(or vol2col) + gemm
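// A minimal shape sketch (an assumed 2-D example, ignoring the share dim):
//   input grad:  filter^T (i_c/g * k_h * k_w, o_c/g) x out_grad
//                (o_c/g, o_h * o_w) -> col (i_c/g * k_h * k_w, o_h * o_w),
//                which col2im folds back onto the input-grad slice;
//   filter grad: out_grad (o_c/g, o_h * o_w) x col^T
//                (o_h * o_w, i_c/g * k_h * k_w) -> (o_c/g, i_c/g * k_h * k_w).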
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output_grad.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
if (is_expand) {
col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
math::SetConstant<DeviceContext, T> set_zero;
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
Tensor transformed_input_grad(input_grad->type());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input_grad,
&transformed_input_grad);
} else {
ResizeToShareLast<DeviceContext, T>(context, input_grad,
&transformed_input_grad);
}
// if is_expand is false, the operation of set_zero is unnecessary,
// because math::matmul will reset input_grad.
if (is_expand) {
set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
}
math::Col2VolFunctor<DeviceContext, T> col2vol;
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch =
transformed_input_grad.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter_.Slice(g * out_step, (g + 1) * out_step);
Tensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
Tensor in_grad_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &in_grad_slice);
if (!is_expand) {
col_matrix.ShareDataWith(in_grad_slice);
col_matrix.Resize(col_matrix_shape);
}
Tensor filter_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &filter_slice);
Tensor out_grad_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &out_grad_slice);
Tensor filter_slice_t = TransposeMpcMat<DeviceContext, T>(context, &filter_slice_);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->matmul(
&filter_slice_t, &out_grad_slice_, &col_matrix);
if (is_expand && data_dim == 2U) {
SharesToCols<DeviceContext, T>(context, &col, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&in_grad_slice_, col2im);
} else if (is_expand && data_dim == 3U) {
SharesToCols<DeviceContext, T>(context, &col, dilations, strides, paddings, &in_grad_slice_, col2vol);
}
TransToSwapedLeadingDims<DeviceContext, T>(context, &in_grad_slice_,
&in_grad_slice);
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(context, &transformed_input_grad,
input_grad);
} else {
TransToShareFirst<DeviceContext, T>(context, &transformed_input_grad,
input_grad);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace());
auto filter_grad_dims = filter_grad->dims();
Tensor filter_grad_ = SwapedLeadingDims<DeviceContext, T>(context, filter_grad);
filter_grad_.Resize(filter_matrix_shape_);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
Tensor in_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &in_slice);
if (!is_expand) {
col.ShareDataWith(in_slice_);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
SharesToCols<DeviceContext, T>(context, &in_slice_, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]}, &col, im2col);
} else if (data_dim == 3U) {
SharesToCols<DeviceContext, T>(context, &in_slice_, dilations, strides, paddings, &col, vol2col);
}
Tensor out_grad_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &out_grad_slice);
Tensor col_mat_t = TransposeMpcMat<DeviceContext, T>(context, &col_matrix);
// gemm
Tensor filter_grad_slice =
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_grad_slice_ = SwapedLeadingDims<DeviceContext, T>(context, &filter_grad_slice);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->matmul(&out_grad_slice_, &col_mat_t, &filter_grad_slice_);
TransToSwapedLeadingDims<DeviceContext, T>(context, &filter_grad_slice_,
&filter_grad_slice);
}
}
TransToSwapedLeadingDims<DeviceContext, T>(context, &filter_grad_,
filter_grad);
filter_grad->Resize(filter_grad_dims);
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "core/paddlefl_mpc/operators/math/concat_and_split.h"
#include <vector>
namespace paddle {
namespace operators {
namespace math {
/*
 * All tensors must have the same rank, and the extents of every dimension
 * must match, except along the axis dimension.
*/
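// A small worked example of the row/column decomposition used below (assumed
// values, not taken from this file): concatenating A = [[1,2],[3,4]] and
// B = [[5],[6]] along axis = 1 gives rows = 2, input_cols = {2, 1} and
// out_cols = 3; every output row is assembled by copying the 2-element row of
// A followed by the 1-element row of B, yielding [[1,2,5],[3,4,6]].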
template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const std::vector<framework::Tensor>& input, int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(input.size());
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
auto output_data = output->data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
auto input_data = input[j].data<T>();
for (int k = 0; k < out_rows; ++k) {
memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place,
input_data + k * col_len, sizeof(T) * col_len);
}
col_idx += col_len;
}
}
};
/*
 * All tensors must have the same rank, and the extents of every dimension
 * must match, except along the axis dimension.
*/
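// The inverse of the concat example above (assumed values): splitting
// [[1,2,5],[3,4,6]] along axis = 1 against reference shapes {2x2, 2x1}
// reproduces A = [[1,2],[3,4]] and B = [[5],[6]]; each input row is walked
// once and scattered into the per-output column blocks.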
template <typename T>
class SplitFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs) {
// TODO(zcd): Add input data validity checking
size_t num = outputs->size();
int input_rows = 1;
auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs->size());
for (size_t i = 0; i < num; ++i) {
int t_cols = ref_inputs[i]->numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
sizeof(T) * col_len);
}
col_idx += col_len;
}
}
}
};
#define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<platform::CPUDeviceContext, type>; \
template class SplitFunctor<platform::CPUDeviceContext, type>;
FOR_ALL_TYPES(DEFINE_FUNCTOR);
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace operators {
namespace math {
/*
* \brief Concatenate the input tensors along the dimension axis.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input[0] = [[1,2],[3,4]]
* Input[1] = [[5,6]]
* axis = 0
*
* Output = [[1,2],
* [3,4],
* [5,6]]
*/
template <typename DeviceContext, typename T>
class ConcatFunctor {
public:
void operator()(const DeviceContext& context,
const std::vector<framework::Tensor>& input, int axis,
framework::Tensor* output);
};
/*
* \brief Split the input tensors along the dimension axis into outputs.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input = [[1,2],
* [3,4],
* [5,6]]
* axis = 0
*
* Output[0] = [[1,2],[3,4]]
* Output[1] = [[5,6]]
*/
template <typename DeviceContext, typename T>
class SplitFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<const framework::Tensor*>& ref_inputs,
int axis, std::vector<framework::Tensor*>* outputs);
};
} // namespace math
} // namespace operators
} // namespace paddle
#define FOR_ALL_TYPES(macro) \
  macro(int64_t);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "./im2col.h"
#include <vector>
#include "./im2col_cfo_cpu.h"
namespace paddle {
namespace operators {
namespace math {
/*
* im = [input_channels, input_height, input_width]
* col =
* [input_channels, filter_height, filter_width, output_height, output_width]
*/
template <class T>
class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& im, const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* col,
const DataLayout data_layout) {
PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3.");
PADDLE_ENFORCE_EQ(col->dims().size(), 5,
"The dimension of col should be 5.");
if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
dilation[1] == 1) {
if (padding[0] == 0 && padding[1] == 0 && padding[2] == 0 &&
padding[3] == 0) {
im2col_sh1sw1dh1dw1ph0pw0<T>(im, col, data_layout);
return;
} else if (padding[0] == 1 && padding[1] == 1 && padding[2] == 1 &&
padding[3] == 1) {
im2col_sh1sw1dh1dw1ph1pw1<T>(im, col, data_layout);
return;
}
// TODO(TJ): complete padding >=2
}
im2col_common<T>(im, dilation, stride, padding, col, data_layout);
}
};
/*
* im = [input_channels, input_height, input_width]
* col =
* [input_channels, filter_height, filter_width, output_height, output_width]
*/
template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& col,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im,
const DataLayout data_layout) {
PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3.");
PADDLE_ENFORCE_EQ(col.dims().size(), 5,
"The dimension of col should be 5.");
int im_channels =
(data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]);
int im_height =
(data_layout != DataLayout::kNHWC ? im->dims()[1] : im->dims()[0]);
int im_width =
(data_layout != DataLayout::kNHWC ? im->dims()[2] : im->dims()[1]);
int filter_height = col.dims()[1];
int filter_width = col.dims()[2];
int col_height = col.dims()[3];
int col_width = col.dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
int channels_col = im_channels * filter_height * filter_width;
T* im_data = im->data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < col_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < col_width; ++w) {
int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
(im_col_idx) >= 0 && (im_col_idx) < im_width) {
int im_offset;
if (data_layout != DataLayout::kNHWC) {
im_offset =
(c_im * im_height + im_row_idx) * im_width + im_col_idx;
} else {
im_offset =
(im_row_idx * im_width + im_col_idx) * im_channels + c_im;
}
im_data[im_offset] +=
col_data[(c * col_height + h) * col_width + w];
}
}
}
}
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext, int64_t>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext, int64_t>;
/*
* im = [input_channels, input_height, input_width]
* col =
* [output_height, output_width, input_channels, filter_height, filter_width]
*/
template <class T>
class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& im, const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* col,
const DataLayout data_layout) {
PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3.");
PADDLE_ENFORCE_EQ(col->dims().size(), 5,
"The dimension of col should be 5.");
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
int filter_height = col->dims()[3];
int filter_width = col->dims()[4];
int col_height = col->dims()[0];
int col_width = col->dims()[1];
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
for (int channel = 0; channel < im_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) {
int im_row_offset =
col_row_idx * stride[0] + filter_row_idx - padding[0];
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_col_offset =
col_col_idx * stride[1] + filter_col_idx - padding[1];
int col_offset =
((((col_row_idx)*col_width + col_col_idx) * im_channels +
channel) *
filter_height +
filter_row_idx) *
filter_width +
filter_col_idx;
int im_offset = (channel * im_height + im_row_offset) * im_width +
im_col_offset;
col_data[col_offset] =
(im_row_offset < 0 || im_row_offset >= im_height ||
im_col_offset < 0 || im_col_offset >= im_width)
? static_cast<T>(0)
: im_data[im_offset];
}
}
}
}
}
}
};
/*
* im = [input_channels, input_height, input_width]
* col =
* [output_height, output_width, input_channels, filter_height, filter_width]
*/
template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& col,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im,
const DataLayout data_layout) {
PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3.");
PADDLE_ENFORCE_EQ(col.dims().size(), 5,
"The dimension of col should be 5.");
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
int col_height = col.dims()[0];
int col_width = col.dims()[1];
PADDLE_ENFORCE_EQ(
(im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
T* im_data = im->data<T>();
const T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
for (int channel = 0; channel < im_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) {
int im_row_offset =
col_row_idx * stride[0] + filter_row_idx - padding[0];
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_col_offset =
col_col_idx * stride[1] + filter_col_idx - padding[1];
int col_offset =
(((col_row_idx * col_width + col_col_idx) * im_channels +
channel) *
filter_height +
filter_row_idx) *
filter_width +
filter_col_idx;
if (im_row_offset >= 0 && im_row_offset < im_height &&
im_col_offset >= 0 && im_col_offset < im_width) {
int im_offset =
(channel * im_height + im_row_offset) * im_width +
im_col_offset;
im_data[im_offset] += col_data[col_offset];
}
}
}
}
}
}
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext, int64_t>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext, int64_t>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/* The storage format of the colData in the Im2ColFunctor and Col2ImFunctor. */
enum class ColFormat { kCFO = 0, kOCF = 1 };
/*
 * \brief Converts image data of three dimensions (CHW) into colData of
 * five dimensions in the Im2ColFunctor calculation; the Col2ImFunctor
 * calculation reverses this.
*
* \param imData Image data.
* \param imShape The shape of imData,
* [input_channels, input_height, input_width].
* \param colData Column data.
* \param colShape The shape of colData.
*
 * \param dilations dilation data,
 *        2-dimension [dilation_height, dilation_width].
 *
 * \param strides stride data,
 *        2-dimension [stride_height, stride_width].
 *
 * \param paddings padding data,
 *        4-dimension [up_pad, left_pad, down_pad, right_pad].
*
* If the template argument Format is kCFO, the shape of colData is:
* [input_channels, filter_height, filter_width, output_height, output_width]
* So, it is easy to reshape into a convolution matrix for convolution
* calculation based on matrix multiplication.
 * The shape of the convolution matrix is [height, width], where height equals
 * input_channels * filter_height * filter_width, and width equals
 * output_height * output_width.
*
* Reshape:
* shape of colData shape of convolution matrix
* [input_channels,
* filter_height,
* filter_width, ======> [height, width]
* output_height,
* output_width]
*
* If the template argument Format is kOCF, the shape of colData is:
* [output_height, output_width, input_channels, filter_height, filter_width]
 * So, it is easy to reshape into a sequence matrix for RNN calculation.
 * The shape of the sequence matrix is [seq_length, step_size], where
 * seq_length equals output_height * output_width, and step_size equals
 * input_channels * filter_height * filter_width.
*
* Reshape:
* shape of colData shape of sequence matrix
* [output_height,
* output_width,
* input_channels, ======> [seqLength, stepSize]
* filter_height,
* filter_width]
*
* \note The caller needs to ensure that imShape.inputChannels is equal to
* colShape.inputChannels.
*/
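/*
 * A concrete kCFO example (assumed values): an im of shape [1, 3, 3] with a
 * 2x2 filter, stride 1 and no padding produces a col of shape
 * [1, 2, 2, 2, 2]; reshaped as the convolution matrix this is
 * [1 * 2 * 2, 2 * 2] = [4, 4].
 */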
template <ColFormat Format, typename DeviceContext, typename T>
class Im2ColFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& im,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* col,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <ColFormat Format, typename DeviceContext, typename T>
class Col2ImFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& col,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im,
const DataLayout data_layout = DataLayout::kNCHW);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
/**
* The most common im2col algorithm.
* Support dilation, stride and padding.
*/
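/**
 * A small worked example (assumed input): im = [1, 3, 3] holding 1..9 in
 * row-major order, filter 2x2, stride 1, padding 0, dilation 1 gives
 * channels_col = 4, and the four rows of col (one per (kh, kw) pair) are
 *   (0,0): {1,2,4,5}, (0,1): {2,3,5,6}, (1,0): {4,5,7,8}, (1,1): {5,6,8,9}.
 */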
template <typename T>
inline void im2col_common(const framework::Tensor& im,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding,
framework::Tensor* col,
const DataLayout data_layout = DataLayout::kNCHW) {
int im_channels =
(data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
int im_height =
(data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
int im_width =
(data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < output_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < output_width; ++w) {
int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
int im_idx;
if (data_layout != DataLayout::kNHWC) {
im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
} else {
im_idx = (im_row_idx * im_width + im_col_idx) * im_channels + c_im;
}
int col_idx = (c * output_height + h) * output_width + w;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0)
: im_data[im_idx];
}
}
}
}
/**
* im2col algorithm with strides == 1, dilations == 1, paddings == 0
*/
template <typename T>
inline void im2col_sh1sw1dh1dw1ph0pw0(
const framework::Tensor& im, framework::Tensor* col,
const DataLayout data_layout = DataLayout::kNCHW) {
int im_channels =
(data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
int im_height =
(data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
int im_width =
(data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
int col_matrix_width = output_width * output_height;
int im_size = im_height * im_width;
size_t copy_size = sizeof(T) * output_width;
const T* im_data_oh = im_data;
T* dst_data_oh = col_data;
for (int oh = 0; oh < output_height; ++oh) {
const T* src_data_ic = im_data_oh;
T* dst_data = dst_data_oh;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = src_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
for (int kw = 0; kw < filter_width; ++kw) {
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data, src_data + kw, copy_size);
} else {
for (int kow = 0; kow < output_width; ++kow) {
dst_data[kow] =
im_data[((oh + kh) * im_width + kw + kow) * im_channels + ic];
}
}
dst_data = dst_data + col_matrix_width;
}
src_data = src_data + im_width;
}
src_data_ic = src_data_ic + im_size;
}
im_data_oh = im_data_oh + im_width;
dst_data_oh = dst_data_oh + output_width;
}
}
/**
 * im2col algorithm with strides == 1, dilations == 1, paddings == 1;
 * the filter_width == 1 case has a special implementation.
*/
template <typename T>
inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
framework::Tensor* col,
const DataLayout data_layout) {
int im_channels =
(data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
int im_height =
(data_layout != DataLayout::kNHWC ? im.dims()[1] : im.dims()[0]);
int im_width =
(data_layout != DataLayout::kNHWC ? im.dims()[2] : im.dims()[1]);
int filter_height = col->dims()[1];
int filter_width = col->dims()[2];
int output_height = col->dims()[3];
int output_width = col->dims()[4];
constexpr int plh = 1;
constexpr int prh = 1;
constexpr int plw = 1;
constexpr int prw = 1;
const T* im_data = im.data<T>();
T* col_data = col->data<T>();
int im_size = im_height * im_width;
int col_matrix_width = output_width * output_height;
int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow
int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow
// fill height padding
{
size_t copy_size = sizeof(T) * output_width;
T* col_start_l = col_data;
T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
col_matrix_width - output_width;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_l = col_start_l;
T* dst_data_r = col_start_r;
for (int kw = 0; kw < filter_width; ++kw) {
std::memset(dst_data_l, 0, copy_size);
std::memset(dst_data_r, 0, copy_size);
dst_data_l = dst_data_l + col_matrix_width;
dst_data_r = dst_data_r + col_matrix_width;
}
col_start_l = col_start_l + col_block_ic;
col_start_r = col_start_r + col_block_ic;
}
}
auto pad = static_cast<T>(0);
if (filter_width == 1) {
// fill width padding
T* dst_data_ic = col_data;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_kh = dst_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
T* dst_data = dst_data_kh;
for (int oh = 0; oh < output_height; ++oh) {
*dst_data = pad;
dst_data = dst_data + output_width - 1;
*dst_data = pad;
++dst_data;
}
dst_data_kh = dst_data_kh + col_block_fh;
}
dst_data_ic = dst_data_ic + col_block_ic;
}
// fill core
size_t copy_size = sizeof(T) * (output_width - plw - prw);
for (int oh = 0; oh < output_height; ++oh) {
const T* im_data_start =
im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
T* dst_data = col_data + oh * output_width;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = im_data_start + ic * im_size;
for (int kh = 0; kh < filter_height; ++kh) {
if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
kh > (filter_height - prh - 1))) {
dst_data = dst_data + col_matrix_width;
continue;
}
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data + plw, src_data, copy_size);
} else {
for (int kow = 0; kow < output_width - plw - prw; ++kow) {
dst_data[plw + kow] =
im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
kow) *
im_channels +
ic];
}
}
dst_data = dst_data + col_matrix_width;
src_data = src_data + im_width;
}
}
}
return;
}
// filter_width != 1
// fill width padding
T* dst_data_ic = col_data;
for (int ic = 0; ic < im_channels; ++ic) {
T* dst_data_kh = dst_data_ic;
for (int kh = 0; kh < filter_height; ++kh) {
for (T* dst_data :
{dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
output_width - 1}) {
// TODO(TJ): from plh, saving repeated assignment
for (int oh = 0; oh < output_height; ++oh) {
*dst_data = pad;
dst_data = dst_data + output_width;
}
}
dst_data_kh = dst_data_kh + col_block_fh;
}
dst_data_ic = dst_data_ic + col_block_ic;
}
// TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
// (output_width-1)}
// the length of copy_size is equal to kw.
for (int oh = 0; oh < output_height; ++oh) {
const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
T* dst_data = col_data + oh * output_width;
for (int ic = 0; ic < im_channels; ++ic) {
const T* src_data = im_data_start + ic * im_size;
for (int kh = 0; kh < filter_height; ++kh) {
if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
kh > (filter_height - prh - 1))) {
dst_data = dst_data + filter_width * col_matrix_width;
continue;
}
// TODO(TJ): reuse plw-kw outside this for
// try to unify
for (int kw = 0; kw < plw; ++kw) {
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data + (plw - kw), src_data,
sizeof(T) * (output_width - (plw - kw)));
} else {
for (int kow = 0; kow < output_width - (plw - kw); ++kow) {
dst_data[plw - kw + kow] =
im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
kow) *
im_channels +
ic];
}
}
dst_data = dst_data + col_matrix_width;
}
for (int kw = plw; kw < filter_width - prw; ++kw) {
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data, src_data + (kw - plw),
sizeof(T) * output_width);
} else {
for (int kow = 0; kow < output_width; ++kow) {
dst_data[kow] =
im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
kw - plw + kow) *
im_channels +
ic];
}
}
dst_data = dst_data + col_matrix_width;
}
int i = 1;
for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
if (data_layout != DataLayout::kNHWC) {
std::memcpy(dst_data, src_data + (kw - plw),
sizeof(T) * (output_width - i));
} else {
for (int kow = 0; kow < output_width - i; ++kow) {
dst_data[kow] =
im_data[(((oh - plh > 0 ? oh - plh : 0) + kh) * im_width +
kw - plw + kow) *
im_channels +
ic];
}
}
dst_data = dst_data + col_matrix_width;
}
src_data = src_data + im_width;
}
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "core/paddlefl_mpc/operators/math/math_function.h"
#include <vector>
#include "paddle/fluid/framework/data_type.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
auto out = framework::EigenMatrix<T>::From(*output);
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
}
}
};
template struct RowwiseAdd<platform::CPUDeviceContext, int64_t>;
using float16 = paddle::platform::float16;
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
struct RowwiseAdd {
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& vec, framework::Tensor* output);
};
template <typename DeviceContext, typename T>
struct SetConstant {
void operator()(const DeviceContext& context, framework::Tensor* tensor,
T num);
};
template <typename DeviceContext, typename T, int Rank>
struct Transpose {
void operator()(const DeviceContext& context, const framework::Tensor& in,
framework::Tensor* out, const std::vector<int>& axis);
};
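// A minimal usage sketch (assumed example, not part of this header):
//   Transpose<platform::CPUDeviceContext, float, 3> trans;
//   trans(dev_ctx, in, &out, /*axis=*/{0, 2, 1});  // swap the last two dims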
template <typename DeviceContext, typename T>
struct ColwiseSum {
void operator()(const DeviceContext& context, const framework::Tensor& input,
framework::Tensor* vec);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "core/paddlefl_mpc/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
framework::Tensor* tensor,
T num) {
auto t = framework::EigenVector<T>::Flatten(*tensor);
t.device(*context.eigen_device()) = t.constant(static_cast<T>(num));
}
template <typename DeviceContext, typename T, int Rank>
void Transpose<DeviceContext, T, Rank>::operator()(
const DeviceContext& context, const framework::Tensor& in,
framework::Tensor* out, const std::vector<int>& axis) {
Eigen::array<int, Rank> permute;
for (int i = 0; i < Rank; i++) {
permute[i] = axis[i];
}
auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
auto* dev = context.eigen_device();
eigen_out.device(*dev) = eigen_in.shuffle(permute);
}
template <typename DeviceContext, typename T>
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* out) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out);
vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
}
// Specialize for CPU, since Eigen implements only a general reduce, which
// has a large overhead on CPU, whereas a column-wise sum can be implemented
// directly and cheaply.
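// A minimal example (assumed values): input = [[1, 2], [3, 4]] gives
// out = [4, 6], i.e. each output element is the sum of one input column.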
template <typename T>
class ColwiseSum<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
if (i == 0) {
out_buf[j] = in_buf[i * size + j];
} else {
out_buf[j] += in_buf[i * size + j];
}
}
}
}
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "core/paddlefl_mpc/operators/math/sequence2batch.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& src,
framework::Vector<size_t> index_lod, framework::Tensor* dst,
bool is_src_index) {
size_t* index = index_lod.data();
auto src_dims = src.dims();
auto dst_dims = dst->dims();
    PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
                      "The src must be a matrix with rank 2.");
    PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
                      "The dst must be a matrix with rank 2.");
    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
                      "The width of src and dst must be the same.");
auto height = dst_dims[0];
auto width = dst_dims[1];
auto* src_data = src.data<T>();
auto* dst_data = dst->data<T>();
const int sz = width * sizeof(T);
if (is_src_index) {
for (int i = 0; i < height; ++i) {
memcpy(dst_data + i * width, src_data + index[i] * width, sz);
}
} else {
for (int i = 0; i < height; ++i) {
memcpy(dst_data + index[i] * width, src_data + i * width, sz);
}
}
}
};
template class CopyMatrixRowsFunctor<platform::CPUDeviceContext, int64_t>;
template class LoDTensor2BatchFunctor<platform::CPUDeviceContext, int64_t>;
template class Batch2LoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
class CopyMatrixRowsFunctor {
public:
// If is_src_index is true,
// copy the indexed rows of input src to the output dst.
// If is_src_index is false,
// copy the input src to the indexed rows of output dst.
// The indexed rows are based on the input index.
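  // A small worked example (assumed values): with index_lod = {2, 0, 1} and
  // is_src_index == true, src row 2 -> dst row 0, src row 0 -> dst row 1,
  // src row 1 -> dst row 2; with is_src_index == false the same index
  // scatters rows in the opposite direction.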
void operator()(const DeviceContext& context, const framework::Tensor& src,
framework::Vector<size_t> index_lod, framework::Tensor* dst,
bool is_src_index);
};
template <typename DeviceContext, typename T>
class LoDTensor2BatchFunctor {
// Calculate the length of each sequence and
// sort sequence index by the length.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
//
struct SeqInfo {
SeqInfo(size_t start, size_t length, size_t seq_idx)
: start(start), length(length), seq_idx(seq_idx) {}
size_t start;
size_t length;
size_t seq_idx;
};
public:
void operator()(const DeviceContext& context,
const framework::LoDTensor& lod_tensor,
framework::LoDTensor* batch, bool is_cal_batch_lod,
bool is_reverse = false) const {
if (!is_cal_batch_lod) {
auto lods = batch->lod();
PADDLE_ENFORCE_GT(lods.size(), 2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]),
"The LoD information should be consistent with the dims.");
CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, lods[1], batch, true);
return;
}
auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
const auto& lod = lods[0];
std::vector<SeqInfo> seq_info;
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
size_t length = lod[seq_id + 1] - lod[seq_id];
seq_info.emplace_back(lod[seq_id], length, seq_id);
}
std::sort(seq_info.begin(), seq_info.end(),
[](SeqInfo a, SeqInfo b) {
return a.length > b.length;
});
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// max_seqlen = 5,
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
// batch_start_positions[0] = len(b0)
// batch_start_positions[1] = len(b0) + len(b1)
// batch_start_positions[2] = len(b0) + len(b1) + len(b2)
// ...
// seq2batch_idx[12] = {4, 0, 9,
// 5, 1, 10,
// 6, 2, 11,
// 7, 3,
// 8}
// seq_order = {1, 0, 2}, the sort order.
// where 1 is the second sequence,
// 0 is the first sequence,
// 2 is the third sequence.
    // The max_seqlen represents the batch size after rearranging the
    // input LoDTensor. It is also the maximum length of the input sequences.
paddle::framework::LoD batch_lods;
batch_lods.emplace_back(std::vector<size_t> {0});
batch_lods.emplace_back(std::vector<size_t> {0});
batch_lods.emplace_back(std::vector<size_t> {0});
// batch_lods[0] is the start positions for batch LoDTensor
size_t max_seqlen = seq_info[0].length;
batch_lods[0].resize(max_seqlen + 1);
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor.
batch_lods[2].resize(seq_info.size());
size_t* batch_starts = batch_lods[0].data();
size_t* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0;
for (size_t n = 0; n < max_seqlen; n++) {
size_t batch_id = batch_starts[n];
for (size_t i = 0; i < seq_info.size(); ++i) {
size_t seq_len = seq_info[i].length;
size_t start = seq_info[i].start;
if (n < seq_len) {
seq2batch_idx[batch_id] =
is_reverse ? start + seq_len - 1 - n : start + n;
batch_id++;
} else {
break;
}
}
batch_starts[n + 1] = batch_id;
}
size_t* seq_order = batch_lods[2].data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx;
}
batch->set_lod(batch_lods);
CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, batch_lods[1], batch, true);
}
};
template <typename DeviceContext, typename T>
class Batch2LoDTensorFunctor {
public:
void operator()(const DeviceContext& context,
const framework::LoDTensor& batch,
framework::LoDTensor* lod_tensor) const {
auto in_lod = batch.lod();
PADDLE_ENFORCE_GT(in_lod.size(), 2UL,
"The LoD of LoDTensor should inlcude at least 2-level "
"sequence information.");
PADDLE_ENFORCE_EQ(
in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]),
"The LoD information should be consistent with the dims.");
CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
to_seq(context, batch, in_lod[1], lod_tensor, false);
}
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "./vol2col.h"
#include <vector>
namespace paddle {
namespace operators {
namespace math {
/*
* vol = [input_channels, input_depth, input_height, input_width]
* col =
* [input_channels, filter_depth, filter_height, filter_width,
* output_depth, output_height, output_width]
*/
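/*
 * A concrete shape sketch (assumed values): vol = [2, 4, 4, 4] with a
 * 2x2x2 filter, stride 1 and no padding gives a 3x3x3 output, so col has
 * shape [2, 2, 2, 2, 3, 3, 3] and the implied gemm matrix is
 * [2 * 2 * 2 * 2, 3 * 3 * 3] = [16, 27].
 */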
template <class T>
class Vol2ColFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& vol,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
"The dimension of col should be 7.");
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]);
int filter_depth = col->dims()[1];
int filter_height = col->dims()[2];
int filter_width = col->dims()[3];
int output_depth = col->dims()[4];
int output_height = col->dims()[5];
int output_width = col->dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
    // paddings may hold 3 values ({d, h, w}, applied symmetrically) or
    // 6 values ({d_forth, d_back, h_up, h_down, w_left, w_right});
    // e.g. {1, 2, 3} expands to {1, 1, 2, 2, 3, 3}.
    bool paddings_size_is_6 = (paddings.size() == 6);
    int pad_d_forth = paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
const T* vol_data = vol.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int c_in = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
int col_idx =
((c * output_depth + d) * output_height + h) * output_width + w;
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
c_in;
}
col_data[col_idx] =
(h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
? static_cast<T>(0)
: vol_data[vol_idx];
}
}
}
}
}
};
/*
 * vol = [input_channels, input_depth, input_height, input_width]
* col =
* [input_channels, filter_depth, filter_height, filter_width,
* output_depth, output_height, output_width]
*/
template <class T>
class Col2VolFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& col,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
"The dimension of vol should be 4.");
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
"The dimension of col should be 7.");
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]);
int filter_depth = col.dims()[1];
int filter_height = col.dims()[2];
int filter_width = col.dims()[3];
int output_depth = col.dims()[4];
int output_height = col.dims()[5];
int output_width = col.dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int cIm = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx =
((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
cIm;
}
int col_idx =
((c * output_depth + d) * output_height + h) * output_width +
w;
vol_data[vol_idx] += col_data[col_idx];
}
}
}
}
}
}
};
template class Vol2ColFunctor<platform::CPUDeviceContext, int64_t>;
template class Col2VolFunctor<platform::CPUDeviceContext, int64_t>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/*
 * \brief Converts the feature data of four dimensions (CDHW) into a colData of
 * seven dimensions in the Vol2ColFunctor calculation,
 * and the Col2VolFunctor calculation reverses it.
*
* \param volData Vol data.
* \param volShape The shape of volData,
* [input_channels, input_depth, input_height, input_width].
* \param colData Column data.
* \param colShape The shape of colData.
*
* \param dilations dilation data.
* \param 3-dimension [dilation_depth, dilation_height, dilation_width].
*
* \param strides stride data.
* \param 3-dimension [stride_depth, stride_height, stride_width].
*
* \param paddings padding data.
* \param 3-dimension [d_pad, h_pad, w_pad].
*
* The shape of colData is:
* [input_channels, filter_depth, filter_height, filter_width, output_depth,
* output_height, output_width]
* So, it is easy to reshape into a convolution matrix for convolution
* calculation based on matrix multiplication.
 * The shape of the convolution matrix is [height, width], where the height equals
 * input_channels * filter_depth * filter_height * filter_width, and the width
 * equals output_depth * output_height * output_width.
*
* Reshape:
* shape of colData shape of convolution matrix
* [input_channels,
* filter_depth,
* filter_height,
* filter_width, ======> [height, width]
* output_depth,
* output_height,
* output_width]
*
* \note The caller needs to ensure that volShape.inputChannels is equal to
* colShape.inputChannels.
*/
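/*
 * A worked example (illustrative values, not taken from any particular model):
 * for vol of shape [input_channels = 3, input_depth = 8, input_height = 8,
 * input_width = 8] with a 3 x 3 x 3 filter, stride 1, dilation 1 and padding 1
 * on every side, each output extent is (8 + 1 + 1 - (1 * (3 - 1) + 1)) / 1 + 1 = 8,
 * so colData has shape [3, 3, 3, 3, 8, 8, 8] and the convolution matrix is
 * [3 * 3 * 3 * 3, 8 * 8 * 8] = [81, 512].
 */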
template <typename DeviceContext, typename T>
class Vol2ColFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& vol,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout = DataLayout::kNCHW) const;
};
template <typename DeviceContext, typename T>
class Col2VolFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& col,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout = DataLayout::kNCHW) const;
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mpc_adam_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include <string>
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class MpcAdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override;
};
void MpcAdamOp::InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE_EQ(
ctx->HasInput("Param"), true,
platform::errors::NotFound("Input(Param) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Grad"), true,
platform::errors::NotFound("Input(Grad) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true,
platform::errors::NotFound(
"Input(Moment1) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true,
platform::errors::NotFound(
"Input(Moment2) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true,
platform::errors::NotFound(
"Input(Beta1Pow) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true,
platform::errors::NotFound(
"Input(Beta2Pow) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true,
platform::errors::NotFound(
"Output(Moment1Out) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true,
platform::errors::NotFound(
"Output(Moment2Out) of AdamOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(
framework::product(lr_dims), 0,
platform::errors::InvalidArgument(
"The number of LearningRate shall not be 0, but received %d. Maybe "
"the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.",
framework::product(lr_dims)));
PADDLE_ENFORCE_EQ(
framework::product(lr_dims), 1,
platform::errors::InvalidArgument(
"Learning rate should have 1 dimension, but received %d",
framework::product(lr_dims)));
auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]";
PADDLE_ENFORCE_GE(framework::product(beta1_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta1 power accumulator should be greater "
"than 0, but received %d.",
framework::product(beta1_pow_dims)));
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]";
PADDLE_ENFORCE_GE(framework::product(beta2_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta2 power accumulator should be greater "
"than 0, but received %d.",
framework::product(beta2_pow_dims)));
auto param_dims = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
platform::errors::InvalidArgument(
"Param and Grad input of AdamOp should have same dimension. But "
"received Param dims: [%s], Grad dims: [%s].",
param_dims, ctx->GetInputDim("Grad")));
}
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
platform::errors::InvalidArgument(
"Param and Moment1 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment1 dims: [%s].",
param_dims, ctx->GetInputDim("Moment1")));
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment2"),
platform::errors::InvalidArgument(
"Param and Moment2 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment2 dims: [%s].",
param_dims, ctx->GetInputDim("Moment2")));
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
ctx->SetOutputDim("Moment2Out", param_dims);
ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims);
}
framework::OpKernelType MpcAdamOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
framework::OpKernelType MpcAdamOp::GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const {
if (var_name == "Beta1Pow" || var_name == "Beta2Pow") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
class MpcAdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("LearningRate", "(Tensor) Learning rate");
AddInput("Moment1", "(Tensor) Input first moment");
AddInput("Moment2", "(Tensor) Input second moment");
AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
AddInput("Beta1Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta1, this has a higher priority than attr(beta1), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("Beta2Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta2, this has a higher priority than attr(beta2), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("Moment1Out", "(Tensor) Output first moment");
AddOutput("Moment2Out", "(Tensor) Output second moment");
AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
AddAttr<float>("beta1",
"(float, default 0.9) "
"Exponential decay rate for the "
"first moment estimates.")
.SetDefault(0.9f);
AddAttr<float>("beta2",
"(float, default 0.999) "
"exponential decay rate for the "
"second moment estimates.")
.SetDefault(0.999f);
AddAttr<float>("epsilon",
"(float, default 1.0e-4) "
"Constant for numerical stability")
.SetDefault(1.0e-4f);
AddComment(R"DOC(
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
mpc_adam, ops::MpcAdamOp, ops::MpcAdamOpMaker);
REGISTER_OP_CPU_KERNEL(
mpc_adam,
ops::MpcAdamOpKernel<paddle::platform::CPUDeviceContext, int64_t, float>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "mpc_op.h"
#include <math.h>
#include "./math/math_function.h"
#include "core/paddlefl_mpc/mpc_protocol/aby3_operators.h"
namespace paddle {
namespace operators {
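// Reads a scalar attribute from an attribute tensor; assumes the tensor already
// lives in host memory (mpc_adam only registers a CPU kernel), so the first
// element can be read directly.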
static inline float GetAttrFromTensor(const framework::Tensor* tensor) {
  const float* tensor_data = tensor->data<float>();
  return tensor_data[0];
}
template <typename DeviceContext, typename T, typename T1>
class MpcAdamOpKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &ctx) const override{
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type()));
using paddle::framework::LoDTensor;
T1 epsilon = static_cast<T1>(ctx.Attr<float>("epsilon"));
auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
auto* mom1 = ctx.Input<LoDTensor>("Moment1");
auto* mom2 = ctx.Input<LoDTensor>("Moment2");
auto* lr = ctx.Input<LoDTensor>("LearningRate");
auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");
auto* mom2_out = ctx.Output<LoDTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
T1 beta1 = static_cast<T1>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(Beta1Tensor) size must be 1, but get %d",
beta1_tensor->numel()));
beta1 = static_cast<T1>(GetAttrFromTensor(beta1_tensor));
}
T1 beta2 = static_cast<T1>(ctx.Attr<float>("beta2"));
if (ctx.HasInput("Beta2Tensor")) {
auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(Beta2Tensor) size must be 1, but get %d",
beta2_tensor->numel()));
beta2 = static_cast<T1>(GetAttrFromTensor(beta2_tensor));
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
<< "beta2_pow.numel() : " << beta2_pow->numel();
VLOG(3) << "param.numel(): " << param->numel();
PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
if (grad_var->IsType<framework::LoDTensor>()) {
auto* grad = ctx.Input<LoDTensor>("Grad");
// AdamFunctor<T, CPUAdam> functor(
// beta1, beta2, epsilon, beta1_pow->data<T>(), beta2_pow->data<T>(),
// mom1->data<T>(), mom1_out->mutable_data<T>(ctx.GetPlace()),
// mom2->data<T>(), mom2_out->mutable_data<T>(ctx.GetPlace()),
// lr->data<T>(), grad->data<T>(), param->data<T>(),
// param_out->mutable_data<T>(ctx.GetPlace()));
// functor(param->numel());
T1 lr_value = *lr->template data<T1>();
T1 beta1_pow_ = *beta1_pow->template data<T1>();
T1 beta2_pow_ = *beta2_pow->template data<T1>();
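      // Bias-corrected learning rate, matching the op's DOC string:
      // lr * sqrt(1 - beta2^t) / (1 - beta1^t).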
double lr_ = lr_value * sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
framework::Tensor temp;
temp.mutable_data<T>(param->dims(), ctx.GetPlace());
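      // Moment updates computed share-wise with the MPC operators:
      //   mom1_out = beta1 * mom1 + (1 - beta1) * grad
      //   mom2_out = beta2 * mom2 + (1 - beta2) * grad * grad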
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(grad, (1 - beta1), &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(mom1, beta1, mom1_out);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->add(mom1_out, &temp, mom1_out);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(grad, (1 - beta2), &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->mul(grad, &temp, &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(mom2, beta2, mom2_out);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->add(mom2_out, &temp, mom2_out);
// mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(grad, lr[0], &temp);
// mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->sub(param, &temp, param_out);
math::SetConstant<DeviceContext, T> set_const;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
set_const(
dev_ctx,
&temp,
T(epsilon * pow(2, mpc::ABY3_SCALING_FACTOR) / 3));
// temp = epsilon + mom2_out
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->add(mom2_out, &temp, &temp);
// temp = 1 / sqrt(epsilon + mom2_out)
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->inverse_square_root(&temp, &temp);
// temp = mom1_out / sqrt(epsilon + mom2_out)
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->mul(mom1_out, &temp, &temp);
// temp = lr * mom1_out / sqrt(epsilon + mom2_out)
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(&temp, lr_, &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->sub(param, &temp, param_out);
beta1_pow_out->mutable_data<T1>(ctx.GetPlace())[0] =
beta1 * beta1_pow->template data<T1>()[0];
beta2_pow_out->mutable_data<T1>(ctx.GetPlace())[0] =
beta2 * beta2_pow->template data<T1>()[0];
} else {
PADDLE_THROW("Variable type not supported by adam_op");
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/data_layout.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "mpc_batch_norm_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
class MpcBatchNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override{
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNorm");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNorm");
OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "BatchNorm");
OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "BatchNorm");
OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "BatchNorm");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "BatchNorm");
bool is_test = ctx->Attrs().Get<bool>("is_test");
bool trainable_stats = ctx->Attrs().Get<bool>("trainable_statistics");
bool test_mode = is_test && (!trainable_stats);
if (!test_mode) {
OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut", "BatchNorm");
OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut",
"BatchNorm");
OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean",
"BatchNorm");
OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance",
"BatchNorm");
}
// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
platform::errors::InvalidArgument(
"Mean and MeanOut should share the same memory"));
PADDLE_ENFORCE_EQ(
ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0],
platform::errors::InvalidArgument(
"Variance and VarianceOut should share the same memory"));
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
if (ctx->IsRuntime() && ctx->HasInput("MomentumTensor")) {
auto mom = ctx->Inputs("MomentumTensor");
PADDLE_ENFORCE_EQ(mom.size(), 1,
platform::errors::InvalidArgument(
"The input tensor MomentumTensor's size must be 1"
"But received: MomentumTensor's size is [%d]",
mom.size()));
}
PADDLE_ENFORCE_GE(
x_dims.size(), 3,
platform::errors::InvalidArgument(
"ShapeError: the dimension of input "
"X must greater than or equal to 3. But received: the shape of input "
"X = [%s], the dimension of input X =[%d]",
x_dims, x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(), 6,
platform::errors::InvalidArgument(
"ShapeError: the dimension of input X "
"must smaller than or equal to 6. But received: the shape of input X "
"= [%s], the dimension of input X = [%d]",
x_dims, x_dims.size()));
const int64_t C =
((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW)
? x_dims[2]
: x_dims[x_dims.size() - 1]);
auto scale_dim = ctx->GetInputDim("Scale");
auto bias_dim = ctx->GetInputDim("Bias");
VLOG(3) << "*** scale_dims: " << scale_dim;
VLOG(3) << "*** bias_dims: " << bias_dim;
VLOG(3) << "*** mean_dims: " << ctx->GetInputDim("Mean");
VLOG(3) << "*** variance_dims: " << ctx->GetInputDim("Variance");
//VLOG(3) << "*** Y_dims: " << ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(
scale_dim.size(), 2UL,
platform::errors::InvalidArgument(
"ShapeError: the dimension of scale must equal to 2."
"But received: the shape of scale is [%s], the dimension "
"of scale is [%d]",
scale_dim, scale_dim.size()));
PADDLE_ENFORCE_EQ(bias_dim.size(), 2UL,
platform::errors::InvalidArgument(
"ShapeError: the dimension of bias must equal to 2."
"But received: the shape of bias is [%s],the dimension "
"of bias is [%d]",
bias_dim, bias_dim.size()));
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
framework::product(bias_dim) <= 0)) {
check = false;
}
if (check) {
PADDLE_ENFORCE_EQ(scale_dim[1], C,
platform::errors::InvalidArgument(
"ShapeError: the shape of scale must equal to [%d]"
"But received: the shape of scale is [%d]",
C, scale_dim[1]));
PADDLE_ENFORCE_EQ(bias_dim[1], C,
platform::errors::InvalidArgument(
"ShapeError: the shape of bias must equal to [%d]"
"But received: the shape of bias is [%d]",
C, bias_dim[1]));
}
ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("MeanOut", {2, C}); // 2: share_num
ctx->SetOutputDim("VarianceOut", {2, C});
ctx->SetOutputDim("SavedMean", {2, C});
ctx->SetOutputDim("SavedVariance", {2, C});
ctx->ShareLoD("X", "Y");
}
protected:
framework::OpKernelType GetExpectedKernelType(const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
layout_, library_);
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};
class MpcBatchNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override{
// check input
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
framework::GradVarName("Y"), "BatchNormGrad");
OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
"BatchNormGrad");
OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
"BatchNormGrad");
// check output
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
framework::GradVarName("X"), "BatchNormGrad");
const bool has_scale_grad = ctx->HasOutput(framework::GradVarName("Scale"));
const bool has_bias_grad = ctx->HasOutput(framework::GradVarName("Bias"));
PADDLE_ENFORCE_EQ((has_scale_grad == has_bias_grad), true,
platform::errors::NotFound(
"Output(Scale@GRAD) and Output(Bias@GRAD) must be null "
"or not be null at same time. But now, "
"has Scale@Grad=[%d], has Bias@GRAD=[%d]",
has_scale_grad, has_bias_grad));
const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
if (use_global_stats) {
PADDLE_ENFORCE_EQ(
!ctx->Attrs().Get<bool>("use_mkldnn"), true,
platform::errors::InvalidArgument(
"Using global stats during training is not supported "
"in gradient op kernel of batch_norm_mkldnn_op now."));
}
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNormGrad");
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
const int C =
((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW)
? x_dims[2]
: x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
// has_scale_grad == has_bias_grad, judge has_scale_grad is enough
if (has_scale_grad) {
ctx->SetOutputDim(framework::GradVarName("Scale"), {2, C}); // 2: share_num
ctx->SetOutputDim(framework::GradVarName("Bias"), {2, C});
}
}
protected:
framework::OpKernelType GetExpectedKernelType(const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_);
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};
class MpcBatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE_GE(
epsilon, 0.0f,
platform::errors::InvalidArgument(
"'epsilon' should be greater or equal than 0.0."));
PADDLE_ENFORCE_LE(epsilon, 0.001f,
platform::errors::InvalidArgument(
"'epsilon' should be less or equal than 0.001."));
});
AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
AddInput("X", "The input tensor");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("Mean",
"The global mean (for training) or "
"estimated mean (for testing)");
AddInput("Variance",
"The global variance (for training) "
"or estimated Variance (for testing)");
AddInput("MomentumTensor",
"(Tensor<float32>, optional) If provided, batch_norm will "
"use this as momentum, this has a higher priority than "
"attr(momentum), the shape of this tensor MUST BE [1].")
.AsDispensable();
AddOutput("Y", "result after normalization");
AddOutput("MeanOut",
"Share memory with Mean. "
"Store the global mean when training");
AddOutput("VarianceOut",
"Share memory with Variance. "
"Store the global Variance when training");
AddOutput("SavedMean",
"Mean of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("SavedVariance",
"Variance of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("ReserveSpace",
"Reserve GPU space for triggering the new semi-persistent "
"NHWC kernel")
.AsDispensable();
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_with_relu",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("use_global_stats",
"(bool, default false) Whether to use global mean and "
"variance. In inference or test mode, set use_global_stats "
"to true or is_test true. the behavior is equivalent. "
"In train mode, when setting use_global_stats True, the "
"global mean and variance are also used during train time, "
"the BN acts as scaling and shiffting.")
.SetDefault(false);
AddAttr<bool>("trainable_statistics",
"(bool, default false) Whether to calculate mean and variance "
"in test mode. If setting true in test mode, mean and variace "
"will be calculated by current batch statistics.")
.SetDefault(false);
AddComment(R"DOC(
Batch Normalization.
Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}
};
template <typename T>
class MpcBatchNormGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const {
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("X", this->Input("X"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetInput("Scale", this->Input("Scale"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput("SavedMean", this->Output("SavedMean"));
op->SetInput("SavedVariance", this->Output("SavedVariance"));
if (this->HasOutput("ReserveSpace")) {
op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
}
// used when setting use_global_stats True during training
if (boost::get<bool>(this->GetAttr("use_global_stats"))) {
op->SetInput("Mean", this->Output("MeanOut"));
op->SetInput("Variance", this->Output("VarianceOut"));
}
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
}
};
class MpcBatchNormOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string>& GetInputOutputWithSameType() const override {
static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Y"}};
return m;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
mpc_batch_norm, ops::MpcBatchNormOp, ops::MpcBatchNormOpMaker,
ops::MpcBatchNormOpInferVarType,
ops::MpcBatchNormGradOpMaker<paddle::framework::OpDesc>,
ops::MpcBatchNormGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mpc_batch_norm_grad, ops::MpcBatchNormGradOp);
REGISTER_OP_CPU_KERNEL(
mpc_batch_norm, ops::MpcBatchNormKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
mpc_batch_norm_grad, ops::MpcBatchNormGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "mpc_op.h"
#include "./math/math_function.h"
#include "core/paddlefl_mpc/mpc_protocol/mpc_operators.h"
namespace paddle {
namespace operators {
using DDim = framework::DDim;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
std::shared_ptr<mpc::MpcOperators> mpc_operators;
// TODO: remove dependency on aby3 protocol
const int MPC_ONE_SHARE = (1 << paddle::mpc::FIXED_POINTER_SCALING_FACTOR) / 3;
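// Presumably one party's additive share of the fixed-point encoding of 1.0
// (the three shares sum to 1.0 in fixed point); used to inject public
// constants such as epsilon into secret-shared tensors.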
template <typename T>
void Expand(const Tensor* input, Tensor* output, int S, int N, int C, int sample_size) {
// Expand tensor into specified shape
// input shape: {S, C}
    // output shape: {S, N, C, H, W}, sample_size = H * W
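    // e.g. with S = 2, N = 2, C = 3, sample_size = 4, input element (s, c) is
    // copied into the 4 contiguous slots of every (s, n, c) block of output.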
const T* input_data = input->data<T>();
T* output_data = output->data<T>();
int input_share_offset = C;
int output_share_offset = N * C * sample_size;
for (int nc = 0; nc < N * C; ++nc) {
int nc_offset = nc * sample_size;
std::fill(output_data + nc_offset, output_data + nc_offset + sample_size, *(input_data + nc % C));
std::fill(output_data + nc_offset + output_share_offset,
output_data + nc_offset + output_share_offset + sample_size,
*(input_data + nc % C + input_share_offset));
}
}
template <typename DeviceContext, typename T>
void TransToChannelFirst(const Tensor* input, Tensor* output, const framework::ExecutionContext &ctx) {
// Transpose tensor
// input shape: {S, N, C, H, W}
// output shape: {C, S, N, H, W}
    // H and W are optional
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto input_dims = input->dims();
switch (input_dims.size()) {
case 3: {
std::vector<int> axis{2, 0, 1};
output->mutable_data<T>({input_dims[2], input_dims[0], input_dims[1]}, ctx.GetPlace());
math::Transpose<DeviceContext, T, 3> trans3;
trans3(dev_ctx, *input, output, axis);
break;
}
case 4: {
std::vector<int> axis{2, 0, 1, 3};
output->mutable_data<T>({input_dims[2], input_dims[0], input_dims[1], input_dims[3]}, ctx.GetPlace());
math::Transpose<DeviceContext, T, 4> trans4;
trans4(dev_ctx, *input, output, axis);
break;
}
case 5: {
std::vector<int> axis{2, 0, 1, 3, 4};
output->mutable_data<T>({input_dims[2], input_dims[0], input_dims[1], input_dims[3], input_dims[4]},
ctx.GetPlace());
math::Transpose<DeviceContext, T, 5> trans5;
trans5(dev_ctx, *input, output, axis);
break;
}
default:
PADDLE_THROW("The size of input X's dimensions should be larger than 2, less than 6.");
}
}
template <typename DeviceContext, typename T>
void ComputeSum(const Tensor* input, int C, Tensor* sum, const framework::ExecutionContext &ctx) {
// Compute sum of each channel
// input shape: {S, N, C, H, W}
// output shape: {S, C}
    // H and W are optional; compute the sum of each channel.
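    // Implementation note: the input is transposed to channel-first and each
    // channel slice is reduced with mpc_operators->sum(); the share-0 result is
    // stored at sum[i] and the share-1 result at sum[i + C].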
Tensor input_trans;
TransToChannelFirst<DeviceContext, T>(input, &input_trans, ctx);
Tensor input_slice;
Tensor sum_slice;
auto sum_slice_data = sum_slice.mutable_data<T>(framework::make_ddim({2, 1}), ctx.GetPlace());
auto sum_data = sum->data<T>();
for (size_t i = 0; i < C; ++i) {
input_slice = input_trans.Slice(i, i + 1);
auto shape = paddle::framework::vectorize<size_t>(input_slice.dims());
shape.erase(shape.begin());
std::vector<int64_t> shape_(shape.cbegin(), shape.cend());
DDim dim(shape_.data(), shape_.size());
input_slice.Resize(dim);
mpc_operators->sum(&input_slice, &sum_slice);
sum_data[i] = sum_slice_data[0];
sum_data[i + C] = sum_slice_data[1];
}
}
template <typename DeviceContext, typename T>
void ComputeMeanVariance(const Tensor* input, int S, int N, int C, int sample_size,
Tensor* saved_mean_e, Tensor* saved_variance_e,
const framework::ExecutionContext &ctx) {
// Compute mean and variance of each channel
// input shape: {S, N, C, H, W}
// output shape: {S, C}
    // H and W are optional
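    // mean = sum(x) / (N * sample_size), var = sum((x - mean)^2) / (N * sample_size)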
VLOG(3) << "Compute the mean and variance of each channel";
Tensor input_trans;
TransToChannelFirst<DeviceContext, T>(input, &input_trans, ctx);
ComputeSum<DeviceContext, T>(input, C, saved_mean_e, ctx);
mpc_operators->scale(saved_mean_e, 1.0 / (N * sample_size), saved_mean_e); // scale
Tensor saved_mean_e_expand;
T* saved_mean_e_expand_data = saved_mean_e_expand.mutable_data<T>(input->dims(), ctx.GetPlace());
Expand<T>(saved_mean_e, &saved_mean_e_expand, S, N, C, sample_size);
mpc_operators->sub(input, &saved_mean_e_expand, &saved_mean_e_expand);
mpc_operators->mul(&saved_mean_e_expand, &saved_mean_e_expand, &saved_mean_e_expand);
ComputeSum<DeviceContext, T>(&saved_mean_e_expand, C, saved_variance_e, ctx);
mpc_operators->scale(saved_variance_e, 1.0 / (N * sample_size), saved_variance_e); // scale
}
template <typename DeviceContext, typename T>
class MpcBatchNormKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &ctx) const override {
mpc_operators = mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators();
VLOG(3) << "Start MpcBatchNormKernel.";
const float epsilon = ctx.Attr<float>("epsilon");
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
bool test_mode = is_test && (!trainable_stats);
bool global_stats = test_mode || use_global_stats;
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor *x = ctx.Input<Tensor>("X");
const DDim x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(), 3,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 2."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(), 6,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensionss is [%d]",
x_dims.size()));
const int S = 2; // share number
const int N = x_dims[1];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[2] : x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / S / N / C;
auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
// alloc memory
y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<T>(ctx.GetPlace());
variance_out->mutable_data<T>(ctx.GetPlace());
saved_mean->mutable_data<T>(ctx.GetPlace());
saved_variance->mutable_data<T>(ctx.GetPlace());
if (!global_stats) {
if ((N * sample_size) == 1) {
// Only 1 element in normalization dimension,
// we skip the batch norm calculation, let y = x.
framework::TensorCopy(*x, ctx.GetPlace(), y);
return;
}
      // saved_xx is used only for this batch of data
// compute mean and variance
switch (data_layout) {
case DataLayout::kNCHW: {
ComputeMeanVariance<DeviceContext, T>(x, S, N, C, sample_size, saved_mean, saved_variance, ctx);
break;
}
default:
PADDLE_THROW("Unknown storage order: %s", data_layout_str);
}
      // update global mean and variance, for prediction
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
momentum = mom_tensor->data<float>()[0];
}
Tensor saved_mean_scale;
Tensor mean_out_scale;
saved_mean_scale.mutable_data<T>(saved_mean->dims(), ctx.GetPlace());
mean_out_scale.mutable_data<T>(mean_out->dims(), ctx.GetPlace());
mpc_operators->scale(mean_out, momentum, &mean_out_scale);
mpc_operators->scale(saved_mean, 1.0 - momentum, &saved_mean_scale);
mpc_operators->add(&mean_out_scale, &saved_mean_scale, mean_out);
mpc_operators->scale(variance_out, momentum, &mean_out_scale);
mpc_operators->scale(saved_variance, 1.0 - momentum, &saved_mean_scale);
mpc_operators->add(&mean_out_scale, &saved_mean_scale, variance_out);
}
// use SavedMean and SavedVariance to do normalize
// compute output y
Tensor inv_std;
Tensor mean_arr;
inv_std.mutable_data<T>({S, C}, ctx.GetPlace());
Tensor epsilon_expand;
    T* epsilon_expand_data = epsilon_expand.mutable_data<T>({S, C}, ctx.GetPlace());
std::fill(epsilon_expand_data, epsilon_expand_data + S * C, MPC_ONE_SHARE * epsilon); // todo
// inv_std = 1 / sqrt(variance + epsilon)
if (global_stats) {
const Tensor* variance = ctx.Input<Tensor>("Variance");
Tensor var_plus_epsilon;
var_plus_epsilon.mutable_data<T>({S, C}, ctx.GetPlace());
mpc_operators->add(variance, &epsilon_expand, &var_plus_epsilon);
mpc_operators->inverse_square_root(&var_plus_epsilon, &inv_std);
mean_arr.ShareDataWith(*ctx.Input<Tensor>("Mean"));
} else {
Tensor var_plus_epsilon;
var_plus_epsilon.mutable_data<T>({S, C}, ctx.GetPlace());
mpc_operators->add(saved_variance, &epsilon_expand, &var_plus_epsilon);
mpc_operators->inverse_square_root(&var_plus_epsilon, &inv_std);
mean_arr.ShareDataWith(*saved_mean);
}
// ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
const T* scale_data = scale->data<T>();
const T* bias_data = bias->data<T>();
Tensor scale_expand;
auto* scale_expand_data = scale_expand.mutable_data<T>({S, C}, ctx.GetPlace());
std::fill(scale_expand_data, scale_expand_data + C, scale_data[0]);
std::fill(scale_expand_data + C, scale_expand_data + C + C, scale_data[1]);
Tensor bias_expand;
auto* bias_expand_data = bias_expand.mutable_data<T>({S, C}, ctx.GetPlace());
std::fill(bias_expand_data, bias_expand_data + C, bias_data[0]);
std::fill(bias_expand_data + C, bias_expand_data + C + C, bias_data[1]);
Tensor new_scale;
Tensor new_bias;
Tensor new_bias_tmp;
new_scale.mutable_data<T>(scale_expand.dims(), ctx.GetPlace());
new_bias.mutable_data<T>(scale_expand.dims(), ctx.GetPlace());
new_bias_tmp.mutable_data<T>(scale_expand.dims(), ctx.GetPlace());
mpc_operators->mul(&inv_std, &scale_expand, &new_scale);
mpc_operators->mul(&mean_arr, &new_scale, &new_bias_tmp);
mpc_operators->sub(&bias_expand, &new_bias_tmp, &new_bias);
switch (data_layout) {
case DataLayout::kNCHW: {
Tensor x_new_scale;
x_new_scale.mutable_data<T>(y->dims(), ctx.GetPlace());
Tensor new_scale_expand;
new_scale_expand.mutable_data<T>(x->dims(), ctx.GetPlace());
Expand<T>(&new_scale, &new_scale_expand, S, N, C, sample_size);
Tensor new_bias_expand;
new_bias_expand.mutable_data<T>(x->dims(), ctx.GetPlace());
Expand<T>(&new_bias, &new_bias_expand, S, N, C, sample_size);
mpc_operators->mul(x, &new_scale_expand, &x_new_scale);
mpc_operators->add(&x_new_scale, &new_bias_expand, y);
break;
}
default:
PADDLE_THROW("Unknown storage order: %d", data_layout);
}
}
};
template <typename DeviceContext, typename T>
class MpcBatchNormGradKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &ctx) const override {
mpc_operators = mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators();
VLOG(3) << "Start MpcBatchNormGradKernel.";
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
// SavedVariance have been reverted in forward operator
const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
const float epsilon = ctx.Attr<float>("epsilon");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
// batch_norm with inplace as false will take X as grad input, which
// is same as cuDNN batch_norm backward calculation, batch_norm
// with inplace as true only take Y as input and X should be calculate
// by inverse operation of batch_norm on Y
const Tensor *x;
x = ctx.Input<Tensor>("X");
PADDLE_ENFORCE_EQ(
is_test, false,
platform::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(), 3,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 2."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(), 6,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensionss is [%d]",
x_dims.size()));
const int S = 2; // share number
const int N = x_dims[1];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[2] : x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / S / N / C;
d_x->mutable_data<T>(ctx.GetPlace());
const T *mean_data = saved_mean->data<T>();
Tensor inv_var_tensor;
inv_var_tensor.ShareDataWith(*saved_inv_variance); // local variance
// update mean_data, compute inv_var = 1 / sqrt(variance + epsilon)
if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_variance = ctx.Input<Tensor>("Variance");
mean_data = running_mean->data<T>();
Tensor inv_var_tmp;
inv_var_tmp.Resize({S, C});
Tensor var_plus_epsilon;
var_plus_epsilon.mutable_data<T>(running_variance->dims(), ctx.GetPlace());
Tensor epsilon_expand;
T* epsilon_expand_data = epsilon_expand.mutable_data<T>({S, C}, ctx.GetPlace());
std::fill(epsilon_expand_data, epsilon_expand_data + S * C, MPC_ONE_SHARE * epsilon);
mpc_operators->add(running_variance, &epsilon_expand, &var_plus_epsilon);
mpc_operators->inverse_square_root(&var_plus_epsilon, &inv_var_tmp);
framework::TensorCopy(inv_var_tmp, ctx.GetPlace(), &inv_var_tensor);
}
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
}
// d_bias = np.sum(d_y, axis=0)
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
if ((N * sample_size) == 1 && !use_global_stats) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
return;
}
switch (data_layout) {
case DataLayout::kNCHW: {
// d_bias = np.sum(d_y, axis=0)
Tensor dy_sum;
dy_sum.Resize({S, C});
dy_sum.mutable_data<T>(ctx.GetPlace());
ComputeSum<DeviceContext, T>(d_y, C, &dy_sum, ctx); // dy_sum
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
// = [np.sum(X * dy) - mean * dy_sum] * inv_std
Tensor x_mul_dy;
x_mul_dy.mutable_data<T>(x->dims(), ctx.GetPlace());
const DDim d_y_dim = d_y->dims();
mpc_operators->mul(x, d_y, &x_mul_dy); // X * dy
Tensor dy_mul_x_sub_mean_mul_invstd_sum;
dy_mul_x_sub_mean_mul_invstd_sum.mutable_data<T>({S, C}, ctx.GetPlace());
ComputeSum<DeviceContext, T>(&x_mul_dy, C, &dy_mul_x_sub_mean_mul_invstd_sum, ctx); // sum(X * dy)
Tensor dy_sum_mul_mean;
dy_sum_mul_mean.mutable_data<T>({S, C}, ctx.GetPlace());
mpc_operators->mul(&dy_sum, saved_mean, &dy_sum_mul_mean); // mean * dy_sum
Tensor tmp;
tmp.mutable_data<T>({S, C}, ctx.GetPlace());
// [np.sum(X * dy) - mean * dy_sum]
mpc_operators->sub(&dy_mul_x_sub_mean_mul_invstd_sum, &dy_sum_mul_mean, &tmp);
// [np.sum(X * dy) - mean * dy_sum] * inv_std
mpc_operators->mul(&tmp, saved_inv_variance, &dy_mul_x_sub_mean_mul_invstd_sum);
if (d_scale && d_bias) {
framework::TensorCopy(dy_sum, ctx.GetPlace(), d_bias);
framework::TensorCopy(dy_mul_x_sub_mean_mul_invstd_sum, ctx.GetPlace(), d_scale);
}
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
      int scale_coeff = use_global_stats ? 1 : N * sample_size;
Tensor scale_inv_var_nhw;
T* scale_inv_var_nhw_data = scale_inv_var_nhw.mutable_data<T>({S, C}, ctx.GetPlace());
// scale * inv_var
mpc_operators->mul(scale, saved_inv_variance, &scale_inv_var_nhw);
// (1. / N) * scale * inv_var
      mpc_operators->scale(&scale_inv_var_nhw, 1.0 / scale_coeff, &scale_inv_var_nhw);
Tensor scale_inv_var_nhw_expand;
scale_inv_var_nhw_expand.mutable_data<T>(d_y_dim, ctx.GetPlace());
Expand<T>(&scale_inv_var_nhw, &scale_inv_var_nhw_expand, S, N, C, sample_size);
if (!use_global_stats) {
Tensor dy_scale;
dy_scale.mutable_data<T>(d_y_dim, ctx.GetPlace());
// N * dy
mpc_operators->scale(d_y, N * sample_size, &dy_scale);
Tensor dy_sum_expand;
dy_sum_expand.mutable_data<T>(d_y_dim, ctx.GetPlace());
Expand<T>(&dy_sum, &dy_sum_expand, S, N, C, sample_size);
Tensor dy_scale_minus_dy;
dy_scale_minus_dy.mutable_data<T>(d_y_dim, ctx.GetPlace());
// N * dy - np.sum(d_y, axis=0)
mpc_operators->sub(&dy_scale, &dy_sum_expand, &dy_scale_minus_dy);
Tensor mean_expand;
mean_expand.mutable_data<T>(d_y_dim, ctx.GetPlace());
Expand<T>(saved_mean, &mean_expand, S, N, C, sample_size);
Tensor x_minus_mean;
x_minus_mean.mutable_data<T>(d_y_dim, ctx.GetPlace());
// (X - mean)
mpc_operators->sub(x, &mean_expand, &x_minus_mean);
// inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
mpc_operators->mul(&dy_mul_x_sub_mean_mul_invstd_sum, saved_inv_variance, &tmp);
Tensor tmp_expand;
tmp_expand.mutable_data<T>(d_y_dim, ctx.GetPlace());
Expand<T>(&tmp, &tmp_expand, S, N, C, sample_size);
Tensor tmp_expand2;
tmp_expand2.mutable_data<T>(d_y_dim, ctx.GetPlace());
// (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)
mpc_operators->mul(&tmp_expand, &x_minus_mean, &tmp_expand2);
mpc_operators->sub(&dy_scale_minus_dy, &tmp_expand2, &dy_scale);
mpc_operators->mul(&scale_inv_var_nhw_expand, &dy_scale, d_x);
} else {
mpc_operators->mul(&scale_inv_var_nhw_expand, d_y, d_x);
}
break;
}
default:
PADDLE_THROW("Unknown storage order: %s", data_layout_str);
} // switch
} // void ComputeImpl
}; // class MpcBatchNormGradKernel
} // namespace operators
} // namespace paddle
......@@ -69,6 +69,119 @@ private:
int64_t n_;
};
template <typename T, typename DeviceContext>
class MidWiseTransformIterator;
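// CPU iterator that replays a length-n vector over a [pre, n, post] layout:
// each element is repeated `post` times and the index wraps after n elements,
// broadcasting the vector along the pre and post dimensions.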
template <typename T>
class MidWiseTransformIterator<T, platform::CPUDeviceContext>
: public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
T *, T &> {
public:
MidWiseTransformIterator(const T *ptr, int n, int post)
: ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
MidWiseTransformIterator<T, platform::CPUDeviceContext> &operator++() {
++j_;
if (UNLIKELY(j_ == post_)) {
++i_;
j_ = 0;
if (UNLIKELY(i_ == n_)) {
i_ = 0;
}
}
return *this;
}
MidWiseTransformIterator<T, platform::CPUDeviceContext> &operator+(int n) {
while (n-- > 0) {
++j_;
if (UNLIKELY(j_ == post_)) {
++i_;
j_ = 0;
if (UNLIKELY(i_ == n_)) {
i_ = 0;
}
}
}
return *this;
}
bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext>
&rhs) const {
return (ptr_ + i_) == &(*rhs);
}
bool operator!=(const MidWiseTransformIterator<T, platform::CPUDeviceContext>
&rhs) const {
return (ptr_ + i_) != &(*rhs);
}
const T &operator*() { return ptr_[i_]; }
private:
const T *ptr_;
int64_t i_;
int64_t j_;
int64_t n_;
int64_t post_;
};
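// Applies `func_` elementwise over x and a (possibly broadcast) y, writing the
// result into z; RunRowWise/RunMidWise wrap the smaller operand in the
// broadcast iterators defined above.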
template <typename Functor, typename T, typename DeviceContext,
typename OutType = T>
class TransformFunctor {
public:
TransformFunctor(const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z, const DeviceContext &ctx, Functor func,
const bool is_xsize_larger = true)
: x_(x->data<T>()),
y_(y->data<T>()),
z_(z->mutable_data<OutType>(ctx.GetPlace())),
nx_(x->numel()),
ctx_(ctx),
func_(func),
is_xsize_larger_(is_xsize_larger) {
if (is_xsize_larger_ == false) {
nx_ = y->numel();
}
}
inline void Run() const {
platform::Transform<DeviceContext> trans;
trans(ctx_, x_, x_ + nx_, y_, z_, func_);
}
inline void RunRowWise(int n, int pre) const {
platform::Transform<DeviceContext> trans;
if (is_xsize_larger_) {
trans(ctx_, x_, x_ + nx_,
RowwiseTransformIterator<T, DeviceContext>(y_, n), z_, func_);
} else {
trans(ctx_, y_, y_ + nx_,
RowwiseTransformIterator<T, DeviceContext>(x_, n), z_, func_);
}
}
inline void RunMidWise(int n, int pre, int post) const {
platform::Transform<DeviceContext> trans;
if (is_xsize_larger_) {
trans(ctx_, x_, x_ + nx_,
MidWiseTransformIterator<T, DeviceContext>(y_, n, post), z_, func_);
} else {
trans(ctx_, y_, y_ + nx_,
MidWiseTransformIterator<T, DeviceContext>(x_, n, post), z_, func_);
}
}
private:
const T *x_;
const T *y_;
OutType *z_;
int64_t nx_;
const DeviceContext &ctx_;
Functor func_;
bool is_xsize_larger_;
};
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T x, T y) { return x + y; }
......@@ -123,27 +236,34 @@ public:
in_y_t_slice = in_y_t->Slice(i, i + 1);
out_t_slice = out_t->Slice(i, i + 1);
auto x_dims = in_x_t_slice.dims();
auto y_dims = in_y_t_slice.dims();
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
int pre, n, post;
GetMidDims get_mid_dims;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
auto x_ = in_x_t_slice.data<T>();
auto y_ = in_y_t_slice.data<T>();
auto out_ = out_t_slice.data<T>();
auto nx_ = in_x_t_slice.numel();
paddle::platform::Transform<DeviceContext> trans;
if (post == 1) {
trans(ctx.template device_context<DeviceContext>(), x_, x_ + nx_,
RowwiseTransformIterator<T, DeviceContext>(y_, n),
out_, AddFunctor<T>());
} else {
trans(ctx.template device_context<DeviceContext>(), x_, x_ + nx_,
MidWiseTransformIterator<T, DeviceContext>(y_, n, post),
out_, AddFunctor<T>());
}
}
}
}
......@@ -185,17 +305,15 @@ public:
int pre, n, post;
GetMidDims get_mid_dims;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
PADDLE_ENFORCE_EQ(post, 1,
                  "post should be equal to 1, but received post is [%d]", post);
std::fill(dy_data, dy_data + dy->numel(), static_cast<T>(0));
for (size_t i = 0; i < SHARE_NUM; ++i) {
int y_offset = i * n;
for (size_t j = 0; j < pre; ++j) {
for (size_t k = 0; k < n; ++k) {
int out_offset = i * pre * n + j * n + k;
if (0 == j) {
dy_data[k + y_offset] = dout_data[out_offset];
} else {
for (size_t m = 0; m < post; ++m) {
int out_offset = i * pre * n * post + j * n * post + k * post + m;
dy_data[k + y_offset] += dout_data[out_offset];
}
}
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mpc_gru_op.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "mpc_op.h"
#include <memory>
#include <string>
#include "core/paddlefl_mpc/operators/math/math_function.h"
namespace paddle
{
namespace operators
{
using framework::DDim;
using framework::Tensor;
using framework::LoD;
class MpcGRUOp : public framework::OperatorWithKernel
{
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override
{
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(%s) of MpcGRUOp should not be null.", "Input");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(%s) of MpcGRUOp should not be null.", "Weight");
PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
"Output(%s) of MpcGRUOp should not be null.", "BatchGate");
PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
"Output(%s) of MpcGRUOp should not be null.",
"BatchResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
"Output(%s) of MpcGRUOp should not be null.", "BatchHidden");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(%s) of MpcGRUOp should not be null.", "Hidden");
auto input_dims_trans = ctx->GetInputDim("Input");
auto input_dims = framework::make_ddim({input_dims_trans[1],
input_dims_trans[0], input_dims_trans[2]});
auto weight_dims = ctx->GetInputDim("Weight");
int input_size = input_dims[2];
int frame_size = weight_dims[1];
if (ctx->IsRuntime())
{
PADDLE_ENFORCE_EQ(
input_size, frame_size * 3,
"The input_size must be 3 times frame_size in MpcGRUOp.");
}
PADDLE_ENFORCE_EQ(
weight_dims[2], frame_size * 3,
"The shape of mpc Weight matrix must be [frame_size, frame_size * 3].");
if (ctx->HasInput("H0"))
{
auto h0_dims = ctx->GetInputDim("H0");
PADDLE_ENFORCE_EQ(h0_dims[2], frame_size,
"The width of H0 must be equal to frame_size.");
}
if (ctx->HasInput("Bias"))
{
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[1];
int bias_width = bias_dims[2];
PADDLE_ENFORCE_EQ(bias_height, 1,
"The shape of Bias must be [1, frame_size * 3].");
PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
}
ctx->SetOutputDim("BatchGate", input_dims);
ctx->SetOutputDim("BatchResetHiddenPrev", {2, input_dims[1], frame_size});
ctx->SetOutputDim("BatchHidden", {2, input_dims[1], frame_size});
ctx->SetOutputDim("Hidden", {2, input_dims[1], frame_size});
ctx->ShareLoD("Input", "Hidden");
}
};
class MpcGRUOpMaker : public framework::OpProtoAndCheckerMaker
{
public:
void Make() override
{
AddInput("Input",
"(LoDTensor) The first input is a LodTensor, which supports "
"variable-time length input sequence. The underlying tensor in "
"this LoDTenosr is a matrix with shape (T x 2 x 3D), where, T is the "
"total time steps in this mini-batch, D is the hidden size."
"Note: before call this OP, "
"Yout must transpose input shape of mini-batch dim to first dim,"
"that is, (2, T, 3D) is transpose to (T, 2, 3D), "
"so that its lod information of shares can be set correctly");
AddInput("H0",
"(Tensor, optional) The initial hidden state is an optional "
"input. This is a tensor with shape (2 x N x D), where N is the "
"batch size, D is the hidden size.")
.AsDispensable();
AddInput(
"Weight",
"(Tensor) The learnable hidden-hidden weight matrix with shape "
"(2 x D x 3D), where D is the hidden size. The elements continuous in "
"memory can be divided into two parts. The first part are weights of "
"the update gate and reset gate with shape (2 x D x 2D), and the second "
"part are weights of output candidate with shape (2 x D x D).");
AddInput("Bias",
"(Tensor, optional) Bias vector with shape (2 x 1 x 3D) concating "
"bias of the update gate, reset gate and output candidate.")
.AsDispensable();
AddOutput("BatchGate",
"(LoDTensor) To compute with batches, sequence data will be "
"reorganized into several successive batches each containing "
"data from the same time step. The LoDTensor BatchGate contains "
"the update gate, reset gate and output candidate values "
"organized in batches. The LoD size is 2. The first LoD contains "
"the batch offsets and the second LoD contains the indexes in "
"the raw sequence data.")
.AsIntermediate();
AddOutput(
"BatchResetHiddenPrev",
"(LoDTensor) The reset hidden state LoDTensor organized in batches. "
"This LoDTensor is a matrix with shape (2 x T x D) and has the same LoD "
"with `BatchGate`.")
.AsIntermediate();
AddOutput(
"BatchHidden",
"(LoDTensor) The hidden state LoDTensor organized in batches. "
"This LoDTensor is a matrix with shape (2 x T x D) and has the same LoD "
"with `BatchGate`.")
.AsIntermediate();
AddOutput(
"Hidden",
"(LoDTensor) the hidden state LoDTensor organized in sequences. "
"This LoDTensor is a matrix with shape (2 x T x D) and has the same LoD "
"with `BatchGate`.");
AddAttr<std::string>("activation",
"(string, default tanh) "
"The activation type used for output candidate {h}_t.")
.SetDefault("relu");
AddAttr<std::string>(
"gate_activation",
"(string, default sigmoid) "
"The activation type used in update gate and reset gate.")
.SetDefault("sigmoid");
AddAttr<bool>("is_reverse",
"(bool, default: False) "
"whether to compute reversed GRU.")
.SetDefault(false);
AddAttr<bool>("origin_mode",
"bool"
"use origin mode in article https://arxiv.org/abs/1412.3555")
.SetDefault(false);
AddComment(R"DOC(
GRU Operator implements part calculations of the complete GRU as following:
$$
update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
$$
@note To implement the complete GRU, a fully-connected operator must be applied
beforehand to feed xu, xr and xc as the Input of the GRU operator.
)DOC");
}
};
class MpcGRUGradOp : public framework::OperatorWithKernel
{
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override
{
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(%s) of MpcGRUGradOp should not be null.", "Input");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(%s) of MpcGRUGradOp should not be null.", "Weight");
PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
"Input(%s) of MpcGRUGradOp should not be null.", "BatchGate");
PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
"Input(%s) of MpcGRUGradOp should not be null.",
"BatchResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
"Input(%s) of MpcGRUOp should not be null.", "BatchHidden");
PADDLE_ENFORCE(ctx->HasInput("Hidden"),
"Input(%s) of MpcGRUGradOp should not be null.", "Hidden");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
"Input(%s@GRAD) of MpcGRUGradOp should not be null.", "Hidden");
auto input_dims_trans = ctx->GetInputDim("Input");
auto input_dims = framework::make_ddim({input_dims_trans[1],
input_dims_trans[0], input_dims_trans[2]});
auto weight_dims = ctx->GetInputDim("Weight");
int input_size = input_dims[2];
int frame_size = weight_dims[1];
int weight_height = weight_dims[1];
int weight_width = weight_dims[2];
PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
                  "The input_size must be 3 times frame_size in MpcGRUOp.");
PADDLE_ENFORCE_EQ(
weight_height, frame_size,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
if (ctx->HasInput("H0"))
{
auto h0_dims = ctx->GetInputDim("H0");
PADDLE_ENFORCE_EQ(h0_dims[2], frame_size,
"The width of H0 must be equal to frame_size.");
auto h0_grad_name = framework::GradVarName("H0");
if (ctx->HasOutput(h0_grad_name))
ctx->SetOutputDim(h0_grad_name, h0_dims);
}
if (ctx->HasInput("Bias"))
{
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[1];
int bias_width = bias_dims[2];
PADDLE_ENFORCE_EQ(bias_height, 1,
"The shape of Bias must be [1, frame_size * 3].");
PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
auto bias_grad_name = framework::GradVarName("Bias");
if (ctx->HasOutput(bias_grad_name))
ctx->SetOutputDim(bias_grad_name, bias_dims);
}
auto input_grad_name = framework::GradVarName("Input");
if (ctx->HasOutput(input_grad_name))
//transpose input's shape
ctx->SetOutputDim(input_grad_name, input_dims);
auto weight_grad_name = framework::GradVarName("Weight");
if (ctx->HasOutput(weight_grad_name))
ctx->SetOutputDim(weight_grad_name, weight_dims);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override
{
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Hidden")),
ctx.device_context());
}
};
template <typename T>
class MpcGRUCPUKernel : public MpcOpKernel<T> {
public:
void BatchCompute(const framework::ExecutionContext& context) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
bool origin_mode = context.Attr<bool>("origin_mode");
auto* input_trans = context.Input<LoDTensor>("Input");
auto* h0 = context.Input<Tensor>("H0");
auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>();
auto* bias = context.Input<Tensor>("Bias");
auto* batch_gate = context.Output<LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(context.GetPlace());
auto* batch_reset_hidden_prev =
context.Output<LoDTensor>("BatchResetHiddenPrev");
batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
batch_hidden->mutable_data<T>(context.GetPlace());
auto* hidden = context.Output<LoDTensor>("Hidden");
hidden->mutable_data<T>(context.GetPlace());
auto hidden_dims = hidden->dims();
const auto place = context.GetPlace();
bool is_reverse = context.Attr<bool>("is_reverse");
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = context.template device_context<DeviceContext>();
// get input lod
auto input_lod = input_trans->lod();
LoD gate_lod;
// transpose input to corrected mpc_input
// (T, 2, 3D) to (2, T, 3D)
math::Transpose<DeviceContext, T, 3> transpose;
Tensor input;
auto input_dim = input_trans->dims();
auto in_dim = framework::make_ddim({input_dim[1], input_dim[0], input_dim[2]});
input.mutable_data<T>(
in_dim,
context.GetPlace());
transpose(dev_ctx, *input_trans, &input, {1, 0, 2});
for (int i = 0; i < 2; ++i) {
// mpc LoDTensor to Batch
Tensor input_s;
Tensor batch_gate_s;
SliceAndReshape(&input, input_s, i);
SliceAndReshape(batch_gate, batch_gate_s, i);
LoDTensor lod_input_s;
LoDTensor lod_batch_gate_s;
lod_input_s.ShareBufferWith(input_s);
lod_input_s.mutable_data<T>(input_s.dims(), place);
lod_batch_gate_s.ShareBufferWith(batch_gate_s);
lod_batch_gate_s.mutable_data<T>(batch_gate_s.dims(), place);
lod_input_s.set_lod(input_lod);
to_batch(dev_ctx, lod_input_s, &lod_batch_gate_s, true, is_reverse);
gate_lod = lod_batch_gate_s.lod();
}
if (bias) {
// add mpc bias
math::RowwiseAdd<DeviceContext, T> add_bias;
for (int i = 0; i < 2; ++i) {
Tensor batch_gate_s;
Tensor bias_s;
SliceAndReshape(batch_gate, batch_gate_s, i);
SliceAndReshape(bias, bias_s, i);
add_bias(dev_ctx, batch_gate_s, bias_s, &batch_gate_s);
}
}
// split mpc weight from shape (2, D, 3D) to 3 * (2, D, D)
std::vector<Tensor> mpc_splitted_weights_t;
//Split3Dim<DeviceContext, T>(context, &mpc_splitted_weights_t, *weight);
SplitWeight<DeviceContext, T>(context, mpc_splitted_weights_t, *weight);
Tensor ordered_h0;
framework::Vector<size_t> order((gate_lod)[2]);
Tensor mpc_hidden_prev_t;
bool has_hidden_prev = false;
if (h0) {
// reordered h0 based on lod
ordered_h0.Resize(h0->dims());
for (int i = 0; i < 2; ++i) {
Tensor h0_s;
Tensor ordered_h0_s;
SliceAndReshape(h0, h0_s, i);
SliceAndReshape(&ordered_h0, ordered_h0_s, i);
ReorderInitState<DeviceContext, T>(
context.template device_context<DeviceContext>(), h0_s, order,
&ordered_h0_s, true);
}
// copy ordered_h0 to mpc_hidden_prev_t
mpc_hidden_prev_t = ordered_h0;
has_hidden_prev = true;
}
auto batch_starts = (gate_lod)[0];
size_t seq_len = batch_starts.size() - 1;
std::vector<Tensor> mpc_gate_t_list;
std::vector<Tensor> mpc_reset_hidden_prev_t_list;
std::vector<Tensor> mpc_hidden_t_list;
// compute gru
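// Each step n covers rows [bstart, bend) of the batched gates; the hidden
// state produced here is copied into mpc_hidden_prev_t for the next step.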
for (size_t n = 0; n < seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
std::vector<Tensor> mpc_splitted_gate_t;
Tensor mpc_batch_gate_t;
Tensor mpc_reset_hidden_prev_t;
Tensor mpc_hidden_t;
ToMpcBatchTensor<DeviceContext, T>(context, mpc_batch_gate_t, *batch_gate, bstart, bend);
Split3Dim<DeviceContext, T>(context, mpc_splitted_gate_t, mpc_batch_gate_t);
ToMpcBatchTensor<DeviceContext, T>(context, mpc_reset_hidden_prev_t, *batch_reset_hidden_prev, bstart, bend);
ToMpcBatchTensor<DeviceContext, T>(context, mpc_hidden_t, *batch_hidden, bstart, bend);
ComputGRUUint<DeviceContext, T>(context, mpc_splitted_gate_t, mpc_splitted_weights_t, mpc_reset_hidden_prev_t,
mpc_hidden_t, mpc_hidden_prev_t, origin_mode, has_hidden_prev);
Tensor mpc_gate_t;
Concat3Dim<DeviceContext, T>(context, &mpc_gate_t, mpc_splitted_gate_t);
//mpc_hidden_prev_t = mpc_hidden_t;
mpc_hidden_prev_t.mutable_data<T>(mpc_hidden_t.dims(), place);
framework::TensorCopy(mpc_hidden_t, context.GetPlace(), &mpc_hidden_prev_t);
mpc_gate_t_list.emplace_back(mpc_gate_t);
mpc_reset_hidden_prev_t_list.emplace_back(mpc_reset_hidden_prev_t);
mpc_hidden_t_list.emplace_back(mpc_hidden_t);
}
// Concat output variables
ConcatBatchAll<DeviceContext, T>(context, batch_gate, mpc_gate_t_list);
ConcatBatchAll<DeviceContext, T>(context, batch_reset_hidden_prev, mpc_reset_hidden_prev_t_list);
ConcatBatchAll<DeviceContext, T>(context, batch_hidden, mpc_hidden_t_list);
// mpc batch tensor to mpc LoDTensor
for (int i = 0; i < 2; ++i)
{
Tensor batch_hidden_s;
SliceAndReshape(batch_hidden, batch_hidden_s, i);
Tensor hidden_s;
SliceAndReshape(hidden, hidden_s, i);
LoDTensor lod_batch_hidden_s;
LoDTensor lod_hidden_s;
lod_batch_hidden_s.ShareBufferWith(batch_hidden_s);
lod_batch_hidden_s.mutable_data<T>(batch_hidden_s.dims(), place);
lod_hidden_s.ShareBufferWith(hidden_s);
lod_hidden_s.mutable_data<T>(hidden_s.dims(), place);
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
lod_batch_hidden_s.set_lod(gate_lod);
lod_hidden_s.set_lod(gate_lod);
to_seq(dev_ctx, lod_batch_hidden_s, &lod_hidden_s);
}
// set batch_gate_lod for grad op
batch_gate->set_lod(gate_lod);
}
void ComputeImpl(const framework::ExecutionContext& context) const override {
BatchCompute(context);
}
};
template <typename T>
class MpcGRUGradOpMaker : public framework::SingleGradOpMaker<T>
{
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override
{
grad_op->SetType("mpc_gru_grad");
grad_op->SetInput("Input", this->Input("Input"));
grad_op->SetInput("H0", this->Input("H0"));
grad_op->SetInput("Bias", this->Input("Bias"));
grad_op->SetInput("Weight", this->Input("Weight"));
grad_op->SetInput("BatchGate", this->Output("BatchGate"));
grad_op->SetInput("BatchResetHiddenPrev",
this->Output("BatchResetHiddenPrev"));
grad_op->SetInput("BatchHidden", this->Output("BatchHidden"));
grad_op->SetInput("Hidden", this->Output("Hidden"));
grad_op->SetInput(framework::GradVarName("Hidden"),
this->OutputGrad("Hidden"));
grad_op->SetOutput(framework::GradVarName("H0"), this->InputGrad("H0"));
grad_op->SetOutput(framework::GradVarName("Input"),
this->InputGrad("Input"));
grad_op->SetOutput(framework::GradVarName("Weight"),
this->InputGrad("Weight"));
grad_op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
grad_op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(MpcGRUGradOpNoNeedBufferVarInference, "Input",
"Bias");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mpc_gru, ops::MpcGRUOp, ops::MpcGRUOpMaker,
ops::MpcGRUGradOpMaker<paddle::framework::OpDesc>,
ops::MpcGRUGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mpc_gru_grad, ops::MpcGRUGradOp,
ops::MpcGRUGradOpNoNeedBufferVarInference);
REGISTER_OP_CPU_KERNEL(mpc_gru, ops::MpcGRUCPUKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(
mpc_gru_grad, ops::MpcGRUGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <functional>
#include <glog/logging.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "core/paddlefl_mpc/operators/math/sequence2batch.h"
#include "core/paddlefl_mpc/operators/math/concat_and_split.h"
#include "core/paddlefl_mpc/operators/math/math_function.h"
#include "mpc_op.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
typedef std::function<void(const Tensor*, Tensor*)> GateActivation;
template<typename T>
inline void ComputeSigmoidGrad(const framework::ExecutionContext& context,
Tensor& dy, Tensor& y, Tensor& dx);
template<typename DeviceContext, typename T>
inline void BackwardStateGrad(const framework::ExecutionContext& context,
std::vector<Tensor>& mpc_splitted_gate_t,
std::vector<Tensor>& mpc_splitted_gate_grad_t,
Tensor& mpc_hidden_prev_t, Tensor& mpc_hidden_prev_grad_t,
Tensor& mpc_hidden_grad_t,
bool origin_mode, bool has_hidden_prev,
bool has_hidden_prev_grad);
template<typename DeviceContext, typename T>
inline void BackwarsResetGrad(const framework::ExecutionContext& context,
std::vector<Tensor>& mpc_splitted_gate_t,
std::vector<Tensor>& mpc_splitted_gate_grad_t,
Tensor& mpc_hidden_prev_t, Tensor& mpc_hidden_prev_grad_t,
Tensor& mpc_reset_hidden_prev_grad_t,
bool has_hidden_prev, bool has_hidden_prev_grad);
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template<typename DeviceContext, typename T>
inline void ComputGRUUint(const framework::ExecutionContext& context,
std::vector<Tensor>& gate_t,
std::vector<Tensor>& weight_t,
Tensor &reset_hidden_prev_t,
Tensor &hidden_t,
Tensor &hidden_prev_t,
bool origin_mode,
bool& has_hidden_prev) {
// compute GRUUnit
Tensor u_h_t;
Tensor r_h_t;
// gate_t[x] shape (2, B, D)
// weight_t[x] shape (2, D, D)
// hidden_prev_t shape (2, B, D)
// hidden_t shape (2, B, D)
u_h_t.mutable_data<T>(gate_t[0].dims(), context.GetPlace());
r_h_t.mutable_data<T>(gate_t[1].dims(), context.GetPlace());
auto mpc_operator = mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators();
if (has_hidden_prev) {
// compute update gate and reset gate: gate_t += hidden_prev_t matmul gate_weight
mpc_operator->matmul(&hidden_prev_t, &weight_t[0], &u_h_t);
mpc_operator->add(&u_h_t, &gate_t[0], &gate_t[0]);
mpc_operator->matmul(&hidden_prev_t, &weight_t[1], &r_h_t);
mpc_operator->add(&r_h_t, &gate_t[1], &gate_t[1]);
}
auto GateActProcess = [&gate_t](const GateActivation fun) {
fun(&gate_t[0], &gate_t[0]);
fun(&gate_t[1], &gate_t[1]);
};
GateActivation activ_functor;
std::string active_gate = context.Attr<std::string>("gate_activation");
if (active_gate == "sigmoid_chebyshev") {
activ_functor = std::bind(&paddle::mpc::MpcOperators::sigmoid_chebyshev,
mpc_operator.get(),
std::placeholders::_1,
std::placeholders::_2);
} else if (active_gate == "sigmoid") {
activ_functor = std::bind(&paddle::mpc::MpcOperators::sigmoid,
mpc_operator.get(),
std::placeholders::_1,
std::placeholders::_2);
} else if (active_gate == "sigmoid_enhanced") {
activ_functor = std::bind(&paddle::mpc::MpcOperators::sigmoid_enhanced,
mpc_operator.get(),
std::placeholders::_1,
std::placeholders::_2);
} else {
PADDLE_THROW("gate activation of %s is not implemented yet.", active_gate);
}
GateActProcess(activ_functor);
if (has_hidden_prev) {
// reset_hidden_prev_t = gate[1] * hidden_prev_t
// compute candidate gate: gate_t[2] += reset_hidden_prev_t matmul state_weight
Tensor r_h_tmp;
r_h_tmp.mutable_data<T>(gate_t[2].dims(), context.GetPlace());
mpc_operator->mul(&gate_t[1], &hidden_prev_t, &reset_hidden_prev_t);
mpc_operator->matmul(&reset_hidden_prev_t, &weight_t[2], &r_h_tmp);
mpc_operator->add(&r_h_tmp, &gate_t[2], &gate_t[2]);
} else {
//initialize reset_hidden_prev_t and hidden_prev_t as 0
math::SetConstant<DeviceContext, T> zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
reset_hidden_prev_t.mutable_data<T>(gate_t[0].dims(), context.GetPlace());
hidden_prev_t.mutable_data<T>(gate_t[0].dims(), context.GetPlace());
zero(dev_ctx, &reset_hidden_prev_t, static_cast<T>(0));
zero(dev_ctx, &hidden_prev_t, static_cast<T>(0));
has_hidden_prev = true;
}
mpc_operator->relu(&gate_t[2], &gate_t[2]);
Tensor u_h_tmp;
Tensor ops_u_h_tmp;
u_h_tmp.mutable_data<T>(hidden_t.dims(), context.GetPlace());
ops_u_h_tmp.mutable_data<T>(hidden_t.dims(), context.GetPlace());
if (origin_mode) {
// compute output hidden_t = (gate[0] * hidden_prev_t + gate[2] - gate[0] * gate[2])
mpc_operator->mul(&gate_t[0], &hidden_prev_t, &u_h_tmp);
mpc_operator->add(&gate_t[2], &u_h_tmp, &u_h_tmp);
mpc_operator->mul(&gate_t[0], &gate_t[2], &ops_u_h_tmp);
mpc_operator->sub(&u_h_tmp, &ops_u_h_tmp, &hidden_t);
} else {
// compute output hidden_t = (gate[0] * gate[2] + hidden_prev_t - gate[0] * hidden_prev_t)
mpc_operator->mul(&gate_t[0], &gate_t[2], &u_h_tmp);
mpc_operator->add(&hidden_prev_t, &u_h_tmp, &u_h_tmp);
mpc_operator->mul(&gate_t[0], &hidden_prev_t, &ops_u_h_tmp);
mpc_operator->sub(&u_h_tmp, &ops_u_h_tmp, &hidden_t);
}
}
inline void SliceAndReshape(const Tensor* input, Tensor &output, int i) {
// Slice mpc tensor to share[i]
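// e.g. an mpc tensor with dims (2, T, 3D) yields a (T, 3D) view of share i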
output = input->Slice(i, i + 1);
auto dims = output.dims();
output.Resize(paddle::framework::slice_ddim(dims, 1, dims.size()));
}
template<typename DeviceContext, typename T>
inline void ToMpcBatchTensor(const framework::ExecutionContext& context,
Tensor& output, const Tensor& input,
int start, int end) {
//input : (2 , T, x) -> output: (2, end - start, x)
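// Tensor::Slice only cuts along dim 0, so swap the share and row dims,
// slice rows [start, end), then transpose back to put the share dim first.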
auto dims = input.dims();
auto& dev_ctx = context. template device_context<DeviceContext>();
math::Transpose<DeviceContext, T, 3> transpose;
Tensor tmp;
tmp.mutable_data<T>(framework::make_ddim({dims[1], dims[0], dims[2]}), context.GetPlace());
transpose(dev_ctx, input, &tmp, {1, 0, 2});
Tensor tmp_slice = tmp.Slice(start, end);
output.mutable_data<T>(framework::make_ddim({dims[0], end - start, dims[2]}), context.GetPlace());
transpose(dev_ctx, tmp_slice, &output, {1, 0, 2});
}
template<typename DeviceContext, typename T>
inline void Split3Dim(const framework::ExecutionContext& context,
std::vector<Tensor>& output,
const Tensor& input) {
// input : (2, x, 3D) -> output : 3 * (2, x, D)
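// Move the last (3D) dim to the front so each contiguous D-sized block can be
// sliced, then transpose every block back to the (2, x, D) layout.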
auto& dev_ctx = context. template device_context<DeviceContext>();
Tensor tmp_trans;
auto dims = input.dims();
int frame_size = dims[2] / 3;
tmp_trans.mutable_data<T>(framework::make_ddim({dims[2], dims[0], dims[1]}), context.GetPlace());
math::Transpose<DeviceContext, T, 3> transpose;
transpose(dev_ctx, input, &tmp_trans, {2, 0, 1});
for (int i = 0; i < 3; ++i) {
Tensor tmp_slice = tmp_trans.Slice(i * frame_size, (i + 1) * frame_size);
Tensor tmp_re_trans;
tmp_re_trans.mutable_data<T>(framework::make_ddim({dims[0], dims[1], dims[2] / 3}),
context.GetPlace());
transpose(dev_ctx, tmp_slice, &tmp_re_trans, {1, 2, 0});
output.emplace_back(tmp_re_trans);
}
}
template<typename DeviceContext, typename T>
inline void Concat3Dim(const framework::ExecutionContext& context,
Tensor* output,
std::vector<Tensor>& input) {
// input 3 * (2, x, D) -> (2, x, 3D)
math::ConcatFunctor<DeviceContext, T> concat;
auto& input_dims = input[0].dims();
std::vector<int64_t> output_dim{input_dims[0], input_dims[1], input_dims[2] * 3};
output->mutable_data<T>(framework::make_ddim(output_dim), context.GetPlace());
auto& dev_ctx = context. template device_context<DeviceContext>();
concat(dev_ctx, input, 3, output);
}
template<typename DeviceContext, typename T>
inline void SplitWeight(const framework::ExecutionContext& context,
std::vector<Tensor>& splitted_weights,
const Tensor& weight) {
// split weight[0], weight[1], weight[2] with shape (2, D, D) from weight (2, D, 3D)
// note that weight[2]'s data start at offset 2 * D * D of weight's data
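// i.e. each share stores the update/reset gate weights (D x 2D) first,
// followed by the candidate (state) weights (D x D)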
auto& dev_ctx = context. template device_context<DeviceContext>();
auto dims = weight.dims();
auto frame_size = dims[2] / 3;
splitted_weights.resize(3);
auto place = context.GetPlace();
// copy weight[0] weight[1] from weight
Tensor update_weight;
update_weight.mutable_data<T>(framework::make_ddim({2, frame_size, 2 * frame_size}),
place);
//splitted_weights->at(2) = new Tensor();
splitted_weights[2].mutable_data<T>(framework::make_ddim({2, frame_size, frame_size}),
place);
for (int i = 0; i < 2; ++i) {
Tensor weight_s;
Tensor update_weight_s;
Tensor weight_3_s;
SliceAndReshape(&weight, weight_s, i);
SliceAndReshape(&update_weight, update_weight_s, i);
SliceAndReshape(&splitted_weights[2], weight_3_s, i);
T* update_s_data = update_weight_s.mutable_data<T>(place);
T* weight_s_data = weight_s.data<T>();
memcpy(update_s_data, weight_s_data, update_weight_s.numel() * sizeof(T));
// weight[3]
memcpy(weight_3_s.mutable_data<T>(place), weight_s_data + 2 * frame_size * frame_size,
weight_3_s.numel() * sizeof(T));
}
// split update_weight to weight[0] and weight[1]
math::Transpose<DeviceContext, T, 3> transpose;
Tensor weight_trans;
weight_trans.mutable_data<T>(framework::make_ddim({2 * frame_size, 2, frame_size}), place);
transpose(dev_ctx, update_weight, &weight_trans, {2, 0, 1});
for (int i = 0; i < 2; ++i) {
//splitted_weights->at(i) = new Tensor();
splitted_weights[i].mutable_data<T>(framework::make_ddim({2, frame_size, frame_size}), place);
transpose(dev_ctx, weight_trans.Slice(frame_size * i, frame_size * (i + 1)),
&splitted_weights[i], {1, 2, 0});
}
}
template<typename DeviceContext, typename T>
inline void ConcatWeight(const framework::ExecutionContext& context,
Tensor* weight,
std::vector<Tensor>& splitted_weights) {
// concat weight[0], weight[1], weight[2] with shape (2, D, D) to weight (2, D, 3D)
// note that weight[2]'s data append after weight[0] and weight[1]
// weight[0] and weight[1] are concat as shape (2, D, 2D) in axis 2
math::ConcatFunctor<DeviceContext, T> concat;
std::vector<Tensor> update_weight_list;
update_weight_list.resize(2);
auto place = context.GetPlace();
auto& splitted_weights_dims = splitted_weights[0].dims();
std::vector<int64_t> weight_dim{splitted_weights_dims[0], splitted_weights_dims[1],
splitted_weights_dims[2] * 3};
weight->mutable_data<T>(framework::make_ddim(weight_dim), context.GetPlace());
for (int i = 0; i < 2; ++i) {
update_weight_list[i] = splitted_weights[i];
}
auto& dev_ctx = context. template device_context<DeviceContext>();
// Concat update weight and reset weight as update weights
Tensor update_weights;
update_weights.mutable_data<T>(
framework::make_ddim({splitted_weights_dims[0],
splitted_weights_dims[1],
splitted_weights_dims[2] * 2}),
place);
concat(dev_ctx, update_weight_list, 3, &update_weights);
// Concat candidate weight
for (int i = 0; i < 2; ++i) {
Tensor weight_s = weight->Slice(i, i + 1);
Tensor update_weights_s = update_weights.Slice(i, i + 1);
Tensor reset_weight_s = splitted_weights[2].Slice(i, i + 1);
T* weight_s_data = weight_s.mutable_data<T>(place);
T* update_weights_s_data = update_weights_s.data<T>();
T* reset_weight_s_data = reset_weight_s.data<T>();
size_t numel_update = update_weights_s.numel();
memcpy(weight_s_data, update_weights_s_data, numel_update * sizeof(T));
memcpy(weight_s_data + numel_update, reset_weight_s_data, reset_weight_s.numel() * sizeof(T));
}
}
template<typename DeviceContext, typename T>
inline void ConcatBatchOne(const framework::ExecutionContext& context,
Tensor* output,
Tensor& input,
int start,
int end) {
// replace output[2, start:end, x] with input (2, end - start, x)
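// Done by moving the row dim to the front, slicing off the untouched prefix
// [0, start) and suffix [end, dims[1]), and concatenating
// [prefix, input, suffix] back along dim 1.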
auto& dev_ctx = context. template device_context<DeviceContext>();
Tensor tmp_trans;
auto dims = output->dims();
tmp_trans.mutable_data<T>(framework::make_ddim({dims[1], dims[0], dims[2]}), context.GetPlace());
math::Transpose<DeviceContext, T, 3> transpose;
transpose(dev_ctx, *output, &tmp_trans, {1, 0, 2});
Tensor splitted_t0;
Tensor splitted_t2;
Tensor splitted_t0_rec;
Tensor splitted_t2_rec;
std::vector<Tensor> concat_in;
if (start > 0) {
splitted_t0 = tmp_trans.Slice(0, start);
auto t0_dims = splitted_t0.dims();
splitted_t0_rec.mutable_data<T>(framework::make_ddim({t0_dims[1], t0_dims[0], t0_dims[2]}),
context.GetPlace());
transpose(dev_ctx, splitted_t0, &splitted_t0_rec, {1, 0, 2});
concat_in.emplace_back(splitted_t0_rec);
}
concat_in.emplace_back(input);
if (end < dims[1]) {
splitted_t2 = tmp_trans.Slice(end, dims[1]);
auto t2_dims = splitted_t2.dims();
splitted_t2_rec.mutable_data<T>(framework::make_ddim({t2_dims[1], t2_dims[0], t2_dims[2]}),
context.GetPlace());
transpose(dev_ctx, splitted_t2, &splitted_t2_rec, {1, 0, 2});
concat_in.emplace_back(splitted_t2_rec);
}
math::ConcatFunctor<DeviceContext, T> concat;
concat(dev_ctx, concat_in, 1, output);
}
template<typename DeviceContext, typename T>
inline void ConcatBatchAll(const framework::ExecutionContext& context,
Tensor* output,
std::vector<Tensor>& input) {
// Concat all input tensors in dims[1]
math::ConcatFunctor<DeviceContext, T> concat;
auto& dev_ctx = context. template device_context<DeviceContext>();
concat(dev_ctx, input, 1, output);
}
template<typename DeviceContext, typename T>
inline void GRUUnitGradCompute(const framework::ExecutionContext& context,
std::vector<Tensor>& mpc_splitted_gate_t,
std::vector<Tensor>& mpc_splitted_gate_grad_t,
Tensor& mpc_hidden_prev_t, Tensor& mpc_hidden_prev_grad_t,
std::vector<Tensor>& mpc_splitted_weights_t,
std::vector<Tensor>& mpc_splitted_weights_grad_t,
Tensor& mpc_reset_hidden_prev_t, Tensor& mpc_reset_hidden_prev_grad_t,
Tensor& mpc_hidden_grad_t, bool origin_mode,
bool& has_hidden_prev, bool& has_hidden_prev_grad,
bool& has_weight_grad) {
// compute GRUUnitGrad
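// Order: BackwardStateGrad (update/candidate gate grads), then the grads
// w.r.t. reset_hidden_prev and the state weight, then BackwarsResetGrad
// (reset gate grad plus the sigmoid gradients), and finally the grads
// w.r.t. hidden_prev and the gate weights.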
BackwardStateGrad<DeviceContext, T>(context,
mpc_splitted_gate_t, mpc_splitted_gate_grad_t,
mpc_hidden_prev_t, mpc_hidden_prev_grad_t,
mpc_hidden_grad_t,
origin_mode, has_hidden_prev, has_hidden_prev_grad);
PADDLE_ENFORCE_NOT_NULL(mpc::MpcInstance::mpc_protocol,
"Protocol %s is not yet created in MPC Protocol.");
auto mpc_operator = mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators();
math::Transpose<DeviceContext, T, 3> transpose;
auto& dev_ctx = context. template device_context<DeviceContext>();
std::vector<int> trans_axis{0, 2, 1};
if (has_hidden_prev && has_hidden_prev_grad) {
auto res_hidden_dims = mpc_reset_hidden_prev_grad_t.dims();
// (B, D) * (D, D)^T + (B, D) :
//reset_hidden_prev_grad = batch_gate_grad[2] * state_weight[2] + reset_hidden_prev_grad
Tensor weight_trans, tmp;
weight_trans.mutable_data<T>(mpc_splitted_weights_t[2].dims(), context.GetPlace());
tmp.mutable_data<T>(res_hidden_dims, context.GetPlace());
transpose(dev_ctx, mpc_splitted_weights_t[2], &weight_trans, trans_axis);
mpc_operator->matmul(&mpc_splitted_gate_grad_t[2], &weight_trans, &tmp);
mpc_operator->add(&mpc_reset_hidden_prev_grad_t, &tmp, &mpc_reset_hidden_prev_grad_t);
if (has_weight_grad) {
// (B, D)^T * (B, D) + (D, D)
// state_weight_grad[2] = reset_hidden_prev * batch_gate_grad[2] + state_weight_grad[2]
Tensor tmp1, tmp2;
tmp1.mutable_data<T>(
framework::make_ddim(
std::vector<int64_t>({res_hidden_dims[0], res_hidden_dims[2], res_hidden_dims[1]})),
context.GetPlace());
tmp2.mutable_data<T>(mpc_splitted_weights_t[2].dims(), context.GetPlace());
transpose(dev_ctx, mpc_reset_hidden_prev_t, &tmp1, trans_axis);
mpc_operator->matmul(&tmp1, &mpc_splitted_gate_grad_t[2], &tmp2);
mpc_operator->add(&mpc_splitted_weights_grad_t[2], &tmp2, &mpc_splitted_weights_grad_t[2]);
}
}
BackwarsResetGrad<DeviceContext, T>(context,
mpc_splitted_gate_t, mpc_splitted_gate_grad_t,
mpc_hidden_prev_t, mpc_hidden_prev_grad_t,
mpc_reset_hidden_prev_grad_t,
has_hidden_prev, has_hidden_prev_grad);
if (has_hidden_prev && has_hidden_prev_grad) {
// (B, 2D) * (D, 2D)^T + (B, D)
// hidden_prev_grad = batch_gate_grad * gate_weight + hidden_prev_grad
// block matrix multiplication: A=[block_A1, block_A2], B^T=[block_B1, block_B2]
// A*B = block_A1*block_B1 + block_A2*block_B2
Tensor tmp1, tmp2;
tmp1.mutable_data<T>(mpc_splitted_weights_t[0].dims(), context.GetPlace());
tmp2.mutable_data<T>(mpc_hidden_prev_t.dims(), context.GetPlace());
transpose(dev_ctx, mpc_splitted_weights_t[0], &tmp1, trans_axis);
mpc_operator->matmul(&mpc_splitted_gate_grad_t[0], &tmp1, &tmp2);
mpc_operator->add(&mpc_hidden_prev_grad_t, &tmp2, &mpc_hidden_prev_grad_t);
transpose(dev_ctx, mpc_splitted_weights_t[1], &tmp1, trans_axis);
mpc_operator->matmul(&mpc_splitted_gate_grad_t[1], &tmp1, &tmp2);
mpc_operator->add(&mpc_hidden_prev_grad_t, &tmp2, &mpc_hidden_prev_grad_t);
if (has_weight_grad) {
// (B, D)^T * (B, 2D) + (D, 2D)
// gate_weight_grad = hidden_prev * batch_gate_grad + gate_weight_grad
auto hid_dims = mpc_hidden_prev_t.dims();
Tensor tmp3, tmp4;
tmp3.mutable_data<T>(
framework::make_ddim({hid_dims[0], hid_dims[2], hid_dims[1]}),
context.GetPlace());
tmp4.mutable_data<T>(mpc_splitted_weights_t[0].dims(), context.GetPlace());
transpose(dev_ctx, mpc_hidden_prev_t, &tmp3, trans_axis);
mpc_operator->matmul(&tmp3, &mpc_splitted_gate_grad_t[0], &tmp4);
mpc_operator->add(&mpc_splitted_weights_grad_t[0], &tmp4, &mpc_splitted_weights_grad_t[0]);
mpc_operator->matmul(&tmp3, &mpc_splitted_gate_grad_t[1], &tmp4);
mpc_operator->add(&mpc_splitted_weights_grad_t[1], &tmp4, &mpc_splitted_weights_grad_t[1]);
}
}
}
template<typename DeviceContext, typename T>
inline void BackwardStateGrad(const framework::ExecutionContext& context,
std::vector<Tensor>& mpc_splitted_gate_t,
std::vector<Tensor>& mpc_splitted_gate_grad_t,
Tensor& mpc_hidden_prev_t, Tensor& mpc_hidden_prev_grad_t,
Tensor& mpc_hidden_grad_t,
bool origin_mode, bool has_hidden_prev,
bool has_hidden_prev_grad) {
PADDLE_ENFORCE_NOT_NULL(mpc::MpcInstance::mpc_protocol,
"Protocol %s is not yet created in MPC Protocol.");
auto mpc_operator = mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators();
math::SetConstant<DeviceContext, T> zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
if (!has_hidden_prev) {
zero(dev_ctx, &mpc_hidden_prev_t, static_cast<T>(0));
}
if (!has_hidden_prev_grad) {
zero(dev_ctx, &mpc_hidden_prev_grad_t, static_cast<T>(0));
}
if (origin_mode) {
// batch_gate_grad[0] = hidden_grad * (hidden_prev - batch_gate[2])
mpc_operator->sub(&mpc_hidden_prev_t, &mpc_splitted_gate_t[2], &mpc_splitted_gate_grad_t[0]);
mpc_operator->mul(&mpc_hidden_grad_t, &mpc_splitted_gate_grad_t[0], &mpc_splitted_gate_grad_t[0]);
// hidden_prev_grad += hidden_grad * batch_gate[0]
Tensor tmp;
tmp.mutable_data<T>(mpc_hidden_prev_grad_t.dims(), context.GetPlace());
mpc_operator->mul(&mpc_hidden_grad_t, &mpc_splitted_gate_t[0], &tmp);
mpc_operator->add(&mpc_hidden_prev_grad_t, &tmp, &mpc_hidden_prev_grad_t);
// batch_gate_grad[2] = activation(hidden_grad * (1-batch_gate[0]), batch_gate[2])
// activation = grad_relu (return a * (b > 0.0 ? 1.0 : 0.0);)
Tensor tmp1;
tmp1.mutable_data<T>(mpc_splitted_gate_grad_t[2].dims(), context.GetPlace());
mpc_operator->mul(&mpc_hidden_grad_t, &mpc_splitted_gate_t[0], &tmp1);
mpc_operator->sub(&mpc_hidden_grad_t, &tmp1, &tmp1);
mpc_operator->relu_grad(&mpc_splitted_gate_t[2], &tmp1, &mpc_splitted_gate_grad_t[2], 0);
} else {
// batch_gate_grad[0] = hidden_grad * (batch_gate[2] - hidden_prev)
mpc_operator->sub(&mpc_splitted_gate_t[2], &mpc_hidden_prev_t, &mpc_splitted_gate_grad_t[0]);
mpc_operator->mul(&mpc_hidden_grad_t, &mpc_splitted_gate_grad_t[0], &mpc_splitted_gate_grad_t[0]);
// hidden_prev_grad += hidden_grad * (1 - batch_gate[0])
Tensor tmp;
tmp.mutable_data<T>(mpc_hidden_prev_grad_t.dims(), context.GetPlace());
mpc_operator->mul(&mpc_hidden_grad_t, &mpc_splitted_gate_t[0], &tmp);
mpc_operator->sub(&mpc_hidden_grad_t, &tmp, &tmp);
mpc_operator->add(&mpc_hidden_prev_grad_t, &tmp, &mpc_hidden_prev_grad_t);
// batch_gate_grad[2] = activation(hidden_grad*batch_gate[0], batch_gate[2])
// activation = grad_relu
Tensor tmp1;
tmp1.mutable_data<T>(mpc_splitted_gate_grad_t[2].dims(), context.GetPlace());
mpc_operator->mul(&mpc_hidden_grad_t, &mpc_splitted_gate_t[0], &tmp1);
mpc_operator->relu_grad(&mpc_splitted_gate_t[2], &tmp1, &mpc_splitted_gate_grad_t[2], 0);
}
}
template<typename DeviceContext, typename T>
inline void BackwarsResetGrad(const framework::ExecutionContext& context,
std::vector<Tensor>& mpc_splitted_gate_t,
std::vector<Tensor>& mpc_splitted_gate_grad_t,
Tensor& mpc_hidden_prev_t, Tensor& mpc_hidden_prev_grad_t,
Tensor& mpc_reset_hidden_prev_grad_t,
bool has_hidden_prev, bool has_hidden_prev_grad) {
PADDLE_ENFORCE_NOT_NULL(mpc::MpcInstance::mpc_protocol,
"Protocol %s is not yet created in MPC Protocol.");
auto mpc_operator = mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators();
math::SetConstant<DeviceContext, T> zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
if (!has_hidden_prev) {
zero(dev_ctx, &mpc_hidden_prev_t, static_cast<T>(0));
}
if (!has_hidden_prev_grad) {
zero(dev_ctx, &mpc_hidden_prev_grad_t, static_cast<T>(0));
}
if (!has_hidden_prev || !has_hidden_prev_grad) {
zero(dev_ctx, &mpc_reset_hidden_prev_grad_t, static_cast<T>(0));
}
// batch_gate_grad[1] = reset_hidden_grad * hidden_prev
mpc_operator->mul(&mpc_reset_hidden_prev_grad_t, &mpc_hidden_prev_t, &mpc_splitted_gate_grad_t[1]);
// hidden_prev_grad += reset_hidden_grad * batch_gate[1]
Tensor tmp;
tmp.mutable_data<T>(mpc_hidden_prev_grad_t.dims(), context.GetPlace());
mpc_operator->mul(&mpc_reset_hidden_prev_grad_t, &mpc_splitted_gate_t[1], &tmp);
mpc_operator->add(&mpc_hidden_prev_grad_t, &tmp, &mpc_hidden_prev_grad_t);
// batch_gate_grad[0] = sigmoid_grad(batch_gate_grad[0], batch_gate[0])
ComputeSigmoidGrad<T>(context, mpc_splitted_gate_grad_t[0],
mpc_splitted_gate_t[0], mpc_splitted_gate_grad_t[0]);
// batch_gate_grad[1] = sigmoid_grad(batch_gate_grad[1], batch_gate[1])
ComputeSigmoidGrad<T>(context, mpc_splitted_gate_grad_t[1],
mpc_splitted_gate_t[1], mpc_splitted_gate_grad_t[1]);
}
template<typename T>
inline void ComputeSigmoidGrad(const framework::ExecutionContext& context,
Tensor& dy, Tensor& y, Tensor& dx) {
// dx = dy * y * (1.0 - y);
PADDLE_ENFORCE_NOT_NULL(mpc::MpcInstance::mpc_protocol,
                        "MPC protocol is not yet created.");
auto mpc_operator = mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators();
Tensor tmp;
tmp.mutable_data<T>(dx.dims(), context.GetPlace());
mpc_operator->mul(&y, &y, &tmp);
mpc_operator->sub(&y, &tmp, &tmp);
mpc_operator->mul(&dy, &tmp, &dx);
}
template <typename DeviceContext, typename T>
class MpcGRUGradKernel : public MpcOpKernel<T> {
public:
void BatchCompute(const framework::ExecutionContext& context) const {
bool origin_mode = context.Attr<bool>("origin_mode");
auto* h0 = context.Input<Tensor>("H0");
auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>();
auto* batch_gate = context.Input<LoDTensor>("BatchGate");
auto* batch_reset_hidden_prev =
context.Input<LoDTensor>("BatchResetHiddenPrev");
auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
auto* hidden = context.Input<LoDTensor>("Hidden");
auto* hidden_grad =
context.Input<LoDTensor>(framework::GradVarName("Hidden"));
auto* input_grad =
context.Output<LoDTensor>(framework::GradVarName("Input"));
auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
auto* weight_grad =
context.Output<Tensor>(framework::GradVarName("Weight"));
auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
auto gate_dims = batch_gate->dims();
auto hidden_dims = hidden->dims();
auto gate_lod = batch_gate->lod();
const auto& place = context.GetPlace();
bool has_hidden_prev = false;
bool has_hidden_prev_grad = false;
bool has_weight_grad = false;
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
context.GetPlace());
math::SetConstant<DeviceContext, T> zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
zero(dev_ctx, &batch_hidden_grad, static_cast<T>(0));
zero(dev_ctx, &batch_gate_grad, static_cast<T>(0));
zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0));
Tensor ordered_h0, ordered_h0_grad;
framework::Vector<size_t> order(gate_lod[2]);
if (h0) {
// Reorder mpc h0
ordered_h0.mutable_data<T>(h0->dims(), place);
for (int i = 0; i < 2; ++i) {
Tensor h0_s;
SliceAndReshape(h0, h0_s, i);
Tensor ordered_h0_s;
SliceAndReshape(&ordered_h0, ordered_h0_s, i);
ReorderInitState<DeviceContext, T>(dev_ctx, h0_s, order, &ordered_h0_s,
true);
}
}
if (h0_grad) {
ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
zero(context.template device_context<DeviceContext>(), &ordered_h0_grad,
static_cast<T>(0));
}
bool is_reverse = context.Attr<bool>("is_reverse");
for (int i = 0; i < 2; ++i) {
// mpc LoDTensor to mpc batch
Tensor batch_hidden_grad_s;
SliceAndReshape(&batch_hidden_grad, batch_hidden_grad_s, i);
Tensor hidden_grad_s;
SliceAndReshape(hidden_grad, hidden_grad_s, i);
LoDTensor lod_batch_hidden_grad_s;
LoDTensor lod_hidden_grad_s;
lod_batch_hidden_grad_s.ShareBufferWith(batch_hidden_grad_s);
lod_batch_hidden_grad_s.mutable_data<T>(batch_hidden_grad_s.dims(), place);
lod_hidden_grad_s.ShareBufferWith(hidden_grad_s);
lod_hidden_grad_s.mutable_data<T>(hidden_grad_s.dims(), place);
lod_hidden_grad_s.set_lod(gate_lod);
lod_batch_hidden_grad_s.set_lod(gate_lod);
to_batch(dev_ctx, lod_hidden_grad_s, &lod_batch_hidden_grad_s, false, is_reverse);
}
if (weight_grad) {
T* gate_weight_grad =
weight_grad->mutable_data<T>(context.GetPlace());
zero(dev_ctx, weight_grad, static_cast<T>(0));
has_weight_grad = true;
}
// split weights
std::vector<Tensor> mpc_splitted_weights_t;
SplitWeight<DeviceContext, T>(context, mpc_splitted_weights_t, *weight);
auto batch_starts = gate_lod[0];
size_t num_batch = batch_starts.size() - 1;
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
int bstart_pre = (n == 0) ? 0 : static_cast<int>(batch_starts[n - 1]);
// Split mpc tensors
Tensor mpc_hidden_grad_t;
Tensor mpc_hidden_prev_t;
Tensor mpc_hidden_prev_grad_t;
Tensor mpc_reset_hidden_prev_t;
Tensor mpc_reset_hidden_prev_grad_t;
std::vector<Tensor> splitted_batch_gate_t;
std::vector<Tensor> mpc_splitted_gate_t;
std::vector<Tensor> splitted_batch_gate_grad_t;
std::vector<Tensor> mpc_splitted_gate_grad_t;
std::vector<Tensor> mpc_splitted_weights_grad_t;
if (weight_grad) {
SplitWeight<DeviceContext, T>(context, mpc_splitted_weights_grad_t, *weight_grad);
}
ToMpcBatchTensor<DeviceContext, T>(context, mpc_hidden_grad_t, batch_hidden_grad, bstart, bend);
ToMpcBatchTensor<DeviceContext, T>(context, mpc_reset_hidden_prev_t, *batch_reset_hidden_prev, bstart, bend);
ToMpcBatchTensor<DeviceContext, T>(context, mpc_reset_hidden_prev_grad_t,
batch_reset_hidden_prev_grad, bstart, bend);
Split3Dim<DeviceContext, T>(context, splitted_batch_gate_grad_t, batch_gate_grad);
Split3Dim<DeviceContext, T>(context, splitted_batch_gate_t, *batch_gate);
mpc_splitted_gate_t.resize(3);
mpc_splitted_gate_grad_t.resize(3);
for (int i = 0; i < 3; ++i) {
ToMpcBatchTensor<DeviceContext, T>(context, mpc_splitted_gate_grad_t[i],
splitted_batch_gate_grad_t[i], bstart, bend);
ToMpcBatchTensor<DeviceContext, T>(context, mpc_splitted_gate_t[i],
splitted_batch_gate_t[i], bstart, bend);
}
if (n == 0) {
if (h0) {
// hidden_prev_t = ordered_h0
mpc_hidden_prev_t.mutable_data<T>(
ordered_h0.dims(), place);
framework::TensorCopy(ordered_h0, place, &mpc_hidden_prev_t);
has_hidden_prev = true;
if (h0_grad) {
// hidden_prev_grad_t = ordered_h0_grad
mpc_hidden_prev_grad_t.mutable_data<T>(
ordered_h0_grad.dims(), place);
framework::TensorCopy(ordered_h0_grad, place, &mpc_hidden_prev_grad_t);
has_hidden_prev_grad = true;
}
}
} else {
ToMpcBatchTensor<DeviceContext, T>(context, mpc_hidden_prev_t, *batch_hidden, bstart_pre, bstart);
ToMpcBatchTensor<DeviceContext, T>(context, mpc_hidden_prev_grad_t, batch_hidden_grad, bstart_pre, bstart);
}
// compute GRUUnitGrad
GRUUnitGradCompute<DeviceContext, T>(context,
mpc_splitted_gate_t, mpc_splitted_gate_grad_t,
mpc_hidden_prev_t, mpc_hidden_prev_grad_t,
mpc_splitted_weights_t, mpc_splitted_weights_grad_t,
mpc_reset_hidden_prev_t, mpc_reset_hidden_prev_grad_t,
mpc_hidden_grad_t, origin_mode, has_hidden_prev,
has_hidden_prev_grad, has_weight_grad);
// cancat mpc tensor to gru_grad output variables
if (weight_grad) {
ConcatWeight<DeviceContext, T>(context, weight_grad, mpc_splitted_weights_grad_t);
}
Tensor mpc_batch_gate_grad_t;
Concat3Dim<DeviceContext, T>(context, &mpc_batch_gate_grad_t, mpc_splitted_gate_grad_t);
ConcatBatchOne<DeviceContext, T>(context, &batch_gate_grad, mpc_batch_gate_grad_t, bstart, bend);
if (n > 0) {
    ConcatBatchOne<DeviceContext, T>(context, &batch_hidden_grad, mpc_hidden_prev_grad_t, bstart_pre, bstart);
} else if (h0 && h0_grad) {
    // the first step's hidden_prev is ordered_h0, so its grad is written back to ordered_h0_grad
    framework::TensorCopy(mpc_hidden_prev_grad_t, place, &ordered_h0_grad);
}
ConcatBatchOne<DeviceContext, T>(context, &batch_reset_hidden_prev_grad, mpc_reset_hidden_prev_grad_t, bstart, bend);
}
if (input_grad) {
// batch to lodTensor for mpc input_grad
input_grad->mutable_data<T>(context.GetPlace());
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batch_gate_grad.set_lod(gate_lod);
for (int i = 0; i < 2; ++i) {
Tensor batch_gate_grad_s;
SliceAndReshape(&batch_gate_grad, batch_gate_grad_s, i);
Tensor input_grad_s;
SliceAndReshape(input_grad, input_grad_s, i);
LoDTensor lod_batch_gate_grad_s;
LoDTensor lod_input_grad_s;
lod_batch_gate_grad_s.ShareBufferWith(batch_gate_grad_s);
lod_batch_gate_grad_s.mutable_data<T>(batch_gate_grad_s.dims(), place);
lod_batch_gate_grad_s.set_lod(gate_lod);
lod_input_grad_s.ShareBufferWith(input_grad_s);
lod_input_grad_s.mutable_data<T>(input_grad_s.dims(), place);
to_seq(dev_ctx, lod_batch_gate_grad_s, &lod_input_grad_s);
}
}
if (bias_grad) {
// col_sum mpc bias_grad
bias_grad->mutable_data<T>(context.GetPlace());
math::ColwiseSum<DeviceContext, T> col_sum;
for (int i = 0; i < 2; ++i) {
Tensor batch_gate_grad_s;
SliceAndReshape(&batch_gate_grad, batch_gate_grad_s, i);
Tensor bias_grad_s;
SliceAndReshape(bias_grad, bias_grad_s, i);
col_sum(dev_ctx, batch_gate_grad_s, &bias_grad_s);
}
}
if (h0 && h0_grad) {
// Reorder mpc h0_grad
for (int i = 0; i < 2; ++i) {
Tensor ordered_h0_grad_s;
SliceAndReshape(&ordered_h0_grad, ordered_h0_grad_s, i);
Tensor h0_grad_s;
SliceAndReshape(h0_grad, h0_grad_s, i);
ReorderInitState<DeviceContext, T>(dev_ctx, ordered_h0_grad_s, order,
&h0_grad_s, false);
}
}
}
void ComputeImpl(const framework::ExecutionContext& context) const override {
BatchCompute(context);
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/framework/op_registry.h"
#include "mpc_lookup_table_v2_op.h"
namespace paddle {
namespace operators {
class MpcLookupTableV2Op : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
"Input(W) of LookupTableV2Op should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Ids"), true,
"Input(Ids) of LookupTableV2Op should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Output(Out) of LookupTableV2Op should not be null.");
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
int ids_rank = ids_dims.size();
VLOG(5) << "ids rank is " << ids_rank << std::endl;
PADDLE_ENFORCE_EQ(
table_dims.size(), 3,
"ShapeError: The dimensions of the 'mpc lookup table' must be 3. "
"But received lookup table's dimensions = %d, "
"lookup table's shape = [%s].",
table_dims.size(), table_dims);
PADDLE_ENFORCE_EQ(
ids_dims.size(), 3,
"ShapeError: The dimensions of the 'idexes' must be 3, "
"Other dimensions are not supported temporarily. "
"Received idexes' dimensions = %d, "
"idexes's shape = [%s].",
table_dims.size(), table_dims);
PADDLE_ENFORCE_EQ(
table_dims[0], 2,
"ShapeError: The first dimensions of the 'mpc lookup table' must be 2. "
"But received lookup table's first dimensions = %d.",
table_dims[0]);
PADDLE_ENFORCE_EQ(
ids_dims[0], 2,
"ShapeError: The first dimensions of the 'indexes' must be 2. "
"But received indexes' first dimensions = %d.",
ids_dims[0]);
auto output_dims = framework::vectorize(ids_dims);
output_dims[output_dims.size() - 1] = table_dims[2];
auto out_dims = framework::make_ddim(output_dims);
ctx->SetOutputDim("Out", out_dims);
if (ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("Ids", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "W");
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class MpcLookupTableV2OpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("W",
"(Tensor) The input represents embedding tensors, "
"which is a learnable parameter.");
AddInput("Ids",
"An input with type int64 "
"contains the ids to be looked up in W.");
AddOutput("Out", "The lookup results, which have the same type as W.");
AddAttr<bool>("is_sparse",
"(boolean, default false) "
"Sparse update.")
.SetDefault(false);
AddAttr<bool>("is_distributed",
"(boolean, default false) distributed lookup table.")
.SetDefault(false);
AddAttr<int64_t>("padding_idx",
"(int64, default -1) "
"If the value is -1, it makes no effect to lookup. "
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding);
// for parameter prefetch
AddAttr<bool>("remote_prefetch", "").SetDefault(false);
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::vector<int64_t>>("height_sections",
"Height for each output SelectedRows.")
.SetDefault(std::vector<int64_t>({}));
AddAttr<std::vector<std::string>>(
"epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"table_names",
"(string vector, the splited table names that will be fetched from "
"parameter server)"
"in the order of input variables for mapping")
.SetDefault({});
AddComment(R"DOC(
Lookup Table V2 Operator.
This operator is used to perform lookups on the parameter W;
the looked-up results are then concatenated into a dense tensor.
The input Ids can carry the LoD (Level of Details) information,
or not. And the output only shares the LoD information with input Ids.
)DOC");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(MpcLookupTableV2GradOpNoBuffer, "W");
template <typename T>
class MpcLookupTableV2GradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("mpc_lookup_table_v2_grad");
op->SetInput("W", this->Input("W"));
op->SetInput("Ids", this->Input("Ids"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("W"), this->InputGrad("W"));
op->SetAttrMap(this->Attrs());
}
};
class MpcLookupTableV2OpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto table_dims = ctx->GetInputDim("W");
ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class MpcLookupTableV2OpGradVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* ctx) const override {
auto out_var_name = framework::GradVarName("W");
auto attr = ctx->GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr);
if (is_sparse) {
VLOG(3) << "mpc_lookup_table_v2_grad op " << framework::GradVarName("W")
<< " is set to SelectedRows";
ctx->SetOutputType(out_var_name,
framework::proto::VarType::SELECTED_ROWS);
} else {
VLOG(3) << "mpc_lookup_table_v2_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor";
ctx->SetOutputType(out_var_name, framework::proto::VarType::LOD_TENSOR);
}
ctx->SetOutputDataType(out_var_name, ctx->GetInputDataType("W"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mpc_lookup_table_v2, ops::MpcLookupTableV2Op,
ops::MpcLookupTableV2OpMaker,
ops::MpcLookupTableV2GradOpMaker<paddle::framework::OpDesc>,
ops::MpcLookupTableV2GradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mpc_lookup_table_v2_grad, ops::MpcLookupTableV2OpGrad,
ops::MpcLookupTableV2GradOpNoBuffer,
ops::MpcLookupTableV2OpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL(mpc_lookup_table_v2, ops::MpcLookupTableV2Kernel<int64_t>);
REGISTER_OP_CPU_KERNEL(mpc_lookup_table_v2_grad,
ops::MpcLookupTableV2GradKernel<int64_t>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "mpc_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#include "core/paddlefl_mpc/operators/math/math_function_impl.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;
constexpr int64_t kNoPadding = -1;
template <typename T>
class MpcLookupTableV2Kernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &context) const override {
auto *ids_t = context.Input<Tensor>("Ids"); // int tensor
auto *output_t = context.Output<Tensor>("Out"); // float tensor
auto *table_var = context.Input<Tensor>("W");
auto *ids = ids_t->data<T>();
auto *table = table_var->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_NOT_NULL(mpc::MpcInstance::mpc_protocol,
"Protocol %s is not yet created in MPC Protocol.");
mpc::MpcInstance::mpc_instance()->mpc_protocol()->
mpc_operators()->matmul(ids_t, table_var, output_t);
}
};
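// Illustrative forward sketch (plaintext view, assuming Ids holds a
// secret-shared one-hot encoding): with vocab_size = 4, emb_dim = 3 and an
// Ids row [0, 1, 0, 0], Out = Ids x W selects the second row of W, i.e. the
// embedding lookup is realized as a secure matmul over the shares.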
template <typename T>
class MpcLookupTableV2GradKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &context) const override {
auto *ids_t = context.Input<Tensor>("Ids");
auto id_dim = ids_t->dims();
auto col_width = id_dim[1];
auto row_width = id_dim[2];
auto *d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
auto *d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
// transpose ids_t
auto *ids = ids_t->data<T>();
auto *table = d_table_t->mutable_data<T>(context.GetPlace());
auto *output = d_output_t->data<T>();
Tensor ids_trans_t;
auto *ids_trans = ids_trans_t.mutable_data<T>({2, row_width, col_width}, context.GetPlace());
math::Transpose<platform::CPUDeviceContext, T, 3> transpose;
auto& dev_ctx = context. template device_context<platform::CPUDeviceContext>();
transpose(dev_ctx, *ids_t, &ids_trans_t, {0, 2, 1});
PADDLE_ENFORCE_NOT_NULL(mpc::MpcInstance::mpc_protocol, "MPC protocol has not been created in MpcInstance yet.");
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->matmul(&ids_trans_t, d_output_t, d_table_t);
}
};
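// Backward sketch (plaintext view): dW = Ids^T x dOut. The shared one-hot
// Ids of shape [2, col_width, row_width] is transposed over its last two
// dims (the leading share dim is kept) and fed to the same secure matmul.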
} // namespace operators
} // namespace paddle
......@@ -150,6 +150,7 @@ public:
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
auto dx_dim = dx->dims();
if (dx->dims().size() > 3) {
dx->Resize({2, x_mat_width, x_mat_height});
}
......@@ -160,7 +161,6 @@ public:
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->matmul(
&dout_matrix, &y_matrix_trans, dx);
auto dx_dim = dx->dims();
if (dx_dim.size() > 3) {
dx->Resize(dx_dim);
}
......@@ -168,6 +168,7 @@ public:
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
auto dy_dim = dy->dims();
if (dy->dims().size() > 3) {
dy->Resize({2, y_mat_width, y_mat_height});
}
......@@ -179,7 +180,6 @@ public:
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->matmul(
&x_matrix_trans, &dout_matrix, dy);
auto dy_dim = dy->dims();
if (dy_dim.size() > 3) {
dy->Resize(dy_dim);
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include <unordered_map>
#include "mpc_pool_op.h"
namespace paddle {
namespace operators {
int PoolOutputSize(int input_size, int filter_size, int padding_1,
int padding_2, int stride, bool ceil_mode) {
int output_size;
if (!ceil_mode) {
output_size = (input_size - filter_size + padding_1 + padding_2) / stride + 1;
} else {
output_size = (input_size - filter_size + padding_1 + padding_2 + stride - 1) / stride + 1;
}
PADDLE_ENFORCE_GT(
output_size, 0,
"ShapeError: the output size must be greater than 0. But received: "
"output_size = %d due to the settings of input_size(%d), padding(%d,%d), "
"k_size(%d) and stride(%d). Please check again!",
output_size, input_size, padding_1, padding_2, filter_size, stride);
return output_size;
}
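// Worked example (illustrative values): for input_size = 7, filter_size = 2,
// paddings = 0, stride = 2:
//   ceil_mode = false: (7 - 2 + 0 + 0) / 2 + 1 = 3
//   ceil_mode = true : (7 - 2 + 0 + 0 + 2 - 1) / 2 + 1 = 4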
class MpcPoolOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override{
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
"X(Input) of Pooling should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Out(Output) of Pooling should not be null.");
std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
// bool adaptive = ctx->Attrs().Get<bool>("adaptive");
bool global_pooling = ctx->Attrs().Get<bool>("global_pooling");
std::string data_format = ctx->Attrs().Get<std::string>("data_format");
std::string padding_algorithm = ctx->Attrs().Get<std::string>("padding_algorithm");
auto in_x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(in_x_dims.size(), 5,
"ShapeError: the input of Op(pool) should be 5-D Tensor (ciphertext). "
"But received: %u-D Tensor and it's shape is [%s].",
in_x_dims.size(), in_x_dims);
PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 3U,
"ShapeError: the dimension of input(ciphertext) minus the size of "
"Attr(ksize)(plaintext) must be euqal to 3 in Op(pool). "
"But received: the dimension of input minus the size "
"of Attr(ksize) is %d, the "
"input's dimension is %d, the shape of input "
"is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
ksize.size(), framework::make_ddim(ksize));
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
"ShapeError: the size of Attr(ksize) and Attr(strides) in "
"Op(pool) must be equal. "
"But received: Attr(ksize)'s size is %d, Attr(strides)'s "
"size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
ksize.size(), strides.size(), framework::make_ddim(ksize),
framework::make_ddim(strides));
PADDLE_ENFORCE_EQ(data_format, "NCHW",
"data format can only be 'NCHW' ",
in_x_dims.size(), in_x_dims);
// update paddings if "SAME" or global_pooling
framework::DDim data_dims;
data_dims = framework::slice_ddim(in_x_dims, 3, in_x_dims.size());
UpdatePadding(&paddings, global_pooling, padding_algorithm,
data_dims, strides, ksize);
if (global_pooling) {
UpdateKsize(&ksize, data_dims);
}
std::vector<int64_t> output_shape;
std::vector<int64_t> one_hot_tensor_shape;
for (int i = 0; i < data_dims.size(); ++i) {
if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) {
output_shape.push_back(data_dims[i]);
} else {
output_shape.push_back(
PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i],
paddings[2 * i + 1], strides[i], ceil_mode));
}
}
output_shape.insert(output_shape.begin(), in_x_dims[0]); // share size
output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); // output_N = input_N
output_shape.insert(output_shape.begin() + 2, in_x_dims[2]); // output_C = input_C
one_hot_tensor_shape.push_back(in_x_dims[0]); // share size
one_hot_tensor_shape.push_back(in_x_dims[1]); // input_N
one_hot_tensor_shape.push_back(in_x_dims[2]); // input_C
one_hot_tensor_shape.push_back(ksize[0] * ksize[1]);
one_hot_tensor_shape.push_back(output_shape[3] * output_shape[4]);
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
ctx->ShareLoD("X", "Out");
ctx->SetOutputDim("One_hot_tensor", framework::make_ddim(one_hot_tensor_shape));
ctx->ShareLoD("X", "One_hot_tensor");
}
protected:
framework::OpKernelType GetExpectedKernelType(const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
layout_, library_);
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};
class MpcPoolOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override{
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
"Input(X@GRAD) should not be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = "AnyLayout";
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_);
}
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
};
class MpcPool2dOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override{
AddInput("X",
"(Tensor) The input tensor of pooling operator. "
"The format of input tensor is NCHW, where N is batch size, C is the "
"number of channels, H is the height of the feature, "
"and W is the width of the feature.");
AddOutput("Out",
"(Tensor) The output tensor of pooling operator. "
"The format of output tensor is also NCHW, "
"where N is batch size, C is the number of channels, "
"H is the height of the feature, "
"and W is the width of the feature.");
AddOutput("One_hot_tensor",
"one hot tensor");
AddAttr<std::string>("pooling_type",
"(string), pooling type, can be \"max\" for max-pooling "
"and \"avg\" for average-pooling.")
.InEnum({"max", "avg"});
AddAttr<std::vector<int>>("ksize",
"(vector<int>) The pooling window "
"size(height, width) of the pooling operator. "
"If global_pooling = true, ksize and paddings will "
"be ignored.");
AddAttr<bool>("global_pooling",
"(bool) Whether to use the global pooling. "
"If global_pooling = true, kernel size and paddings will be ignored. "
"Default False.")
.SetDefault(false);
AddAttr<std::vector<int>>("strides",
"(vector<int>, default {1, 1}), strides(height, "
"width) of pooling operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings",
"(vector<int>, default {0,0}), paddings(height_top, height_bottom, "
"width_left, wifth_right) of pooling operator."
"If global_pooling = true, paddings and kernel size will be ignored.")
.SetDefault({0, 0});
AddAttr<bool>("exclusive",
"(bool) When true, will exclude the zero-padding in the "
"averaging calculating, otherwise, include the zero-padding. Note, it "
"is only used when pooling_type is avg. The default is True. "
"Default True.")
.SetDefault(true);
AddAttr<bool>("ceil_mode",
"(bool) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used. Default False")
.SetDefault(false);
AddAttr<std::string>("data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("NCHW");
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<std::string>("padding_algorithm",
"(string, default \"EXPLICIT\") An optional string from: \"EXPLICIT\","
"\"SAME\",\"VALID\". Set to \"EXPLICIT\" for explicit padding. "
"Set to \"SAME\" or \"VALID\" for algorithm of padding. ")
.SetDefault("EXPLICIT");
AddComment(R"DOC(
This operation calculates the pooling output based on
the input, pooling_type and pool_size, pool_stride, pool_padding parameters.
Input(X) and Output(Out) are in NCHW or NHWC format, where N is batch size, C is the
number of channels, H is the height of the feature, and W is the width of the feature.
Parameters(pool_size, pool_stride, pool_padding) hold two integer elements.
These two elements represent height and width, respectively.
The input(X) size and output(Out) size may be different.
)DOC");
}
};
class MpcPoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string>& GetInputOutputWithSameType() const override {
static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Out"}};
return m;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
mpc_pool2d, ops::MpcPoolOp, ops::MpcPool2dOpMaker, ops::MpcPoolOpInferVarType,
paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
REGISTER_OPERATOR(mpc_pool2d_grad, ops::MpcPoolOpGrad);
REGISTER_OP_CPU_KERNEL(
mpc_pool2d, ops::MpcPoolKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
mpc_pool2d_grad, ops::MpcPoolGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "mpc_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename T = int>
inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling,
const std::string& padding_algorithm,
const framework::DDim data_dims,
const std::vector<T>& strides,
const std::vector<T>& ksize) {
// set padding size == data_dims.size() * 2
auto data_shape = framework::vectorize<T>(data_dims);
if (static_cast<int>(paddings->size()) == data_dims.size()) {
for (int i = 0; i < data_dims.size(); ++i) {
T copy_pad = *(paddings->begin() + 2 * i);
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
}
} else {
PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(),
"Paddings size should be the same or twice as the pooling size.");
}
// when padding_algorithm is "VALID" or "SAME"
if (padding_algorithm == "SAME") {
for (int i = 0; i < data_dims.size(); ++i) {
T out_size = (data_dims[i] + strides[i] - 1) / strides[i];
T pad_sum = std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], static_cast<T>(0));
T pad_0 = pad_sum / 2;
T pad_1 = pad_sum - pad_0;
*(paddings->begin() + i * 2) = pad_0;
*(paddings->begin() + i * 2 + 1) = pad_1;
}
} else if (padding_algorithm == "VALID") {
for (auto it = paddings->begin(); it != paddings->end(); it++) {
*it = 0;
}
}
// if global_pooling == true, paddings will be ignored
if (global_pooling) {
for (auto it = paddings->begin(); it != paddings->end(); it++) {
*it = 0;
}
}
}
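// Worked example (illustrative values) for padding_algorithm == "SAME":
// with data_dims[i] = 5, strides[i] = 2, ksize[i] = 3:
//   out_size = (5 + 2 - 1) / 2 = 3
//   pad_sum  = max((3 - 1) * 2 + 3 - 5, 0) = 2, so pad_0 = 1, pad_1 = 1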
template <typename T = int>
inline void UpdateKsize(std::vector<T>* ksize,
const framework::DDim data_dims) {
ksize->resize(static_cast<size_t>(data_dims.size()));
for (size_t i = 0; i < ksize->size(); ++i) {
*(ksize->begin() + i) = static_cast<T>(data_dims[i]);
}
}
template <typename T, typename Func>
void VisitDataStrideWise(DDim in_dims, DDim out_dims,
std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& paddings,
const T* src, T* target, int src_stride, int target_stride, Func visitor) {
const int share_size = in_dims[0];
const int batch_size = in_dims[1];
const int channel_size = in_dims[2];
const int input_height = in_dims[3];
const int input_width = in_dims[4];
const int out_height = out_dims[3];
const int out_width = out_dims[4];
const int out_mat_numel = out_height * out_width;
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int filter_numel = ksize_height * ksize_width;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
int hstart, hend;
int wstart, wend;
int idx = 0;
while (idx++ < batch_size * channel_size) {
for (size_t ph = 0; ph < out_height; ++ph) {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (size_t pw = 0; pw < out_width; ++pw) {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
visitor(ph, pw, input_height, input_width, out_height, out_width, hstart, hend,
wstart, wend, src, target);
}
}
src += src_stride;
target += target_stride;
}
}
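// Worked example (illustrative values): input_height = input_width = 4,
// ksize = {2, 2}, strides = {2, 2}, paddings = {0, 0} gives a 2 x 2 output.
// At (ph, pw) = (1, 1) the visitor is called with hstart = 2, hend = 4,
// wstart = 2, wend = 4, i.e. the bottom-right 2 x 2 window of the input.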
template <typename DeviceContext, typename T>
class MpcPoolKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &context) const override {
const Tensor* in_x = context.Input<Tensor>("X");
Tensor* out = context.Output<Tensor>("Out");
Tensor* out_one_hot_tensor = context.Output<Tensor>("One_hot_tensor");
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::string data_format = context.Attr<std::string>("data_format"); // NCHW
bool global_pooling = context.Attr<bool>("global_pooling");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const T* in_x_data = in_x->data<T>();
T* output_data = out->mutable_data<T>(context.GetPlace());
T* one_hot_tensor_data = out_one_hot_tensor->mutable_data<T>(context.GetPlace());
// update paddings
auto in_x_dims = in_x->dims();
auto out_dims = out->dims();
const int input_stride = in_x_dims[3] * in_x_dims[4];
const int output_stride = out_dims[3] * out_dims[4];
const int one_hot_tensor_stride = ksize[0] * ksize[1] * out_dims[3] * out_dims[4];
// create temp tensor
auto& dev_ctx = context.template device_context<DeviceContext>();
Tensor input2col = context.AllocateTmpTensor<T, DeviceContext>(out_one_hot_tensor->dims(), dev_ctx);
T* input2col_data = input2col.data<T>();
std::fill(input2col_data, input2col_data + input2col.numel(), static_cast<T>(0));
framework::DDim data_dims;
data_dims = framework::slice_ddim(in_x_dims, 3, in_x_dims.size());
// update padding => h, w
UpdatePadding(&paddings, global_pooling, padding_algorithm,
data_dims, strides, ksize);
if (data_dims.size() * 2 == static_cast<int>(paddings.size())) {
for (int i = 0; i < data_dims.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (global_pooling) {
UpdateKsize(&ksize, data_dims);
}
// share0, share1
const int input_plaintext_size = in_x->numel() / 2;
const int input2col_plaintext_size = out_one_hot_tensor->numel() / 2;
// im2col
auto get_im2col = [=] (int ph, int pw, int input_height, int input_width, int out_height, int out_width,
int hstart, int hend, int wstart, int wend, const T* src, T* target) {
size_t out_index = ph * out_width + pw;
size_t offset = out_height * out_width;
size_t index = 0;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
target[out_index + index * offset] = src[h * input_width + w]; // share0
target[out_index + index * offset + input2col_plaintext_size] =
src[h * input_width + w + input_plaintext_size]; // share1
++index;
}
}
};
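// Illustrative layout (hypothetical sizes): with ksize = {2, 2} and an
// output of 3 x 3, each (b, c) slice of input2col is a (2*2) x (3*3)
// matrix; column (ph * 3 + pw) holds the 4 input values covered by the
// pooling window at (ph, pw), and share1 values are stored at an offset of
// input2col_plaintext_size from share0.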
// input2col
// convert in_x_data (S, B, C, H, W) into (S, B, C, filter_size * filter_size, H_output * W_output)
VisitDataStrideWise(in_x_dims, out_dims, ksize, strides, paddings, in_x_data, input2col_data, input_stride, one_hot_tensor_stride, get_im2col);
const T* input2col_data2 = input2col.data<T>();
// maxpooling(input2col_trans), return(max2col, out_one_hot_tensor_trans)
// input2col_trans: (S, filter_size * filter_size, B, C, H_output * W_output)
// max2col: (S, 1, B, C, H_output * W_output)
// out_one_hot_tensor_trans: (S, filter_size * filter_size, B, C, H_output * W_output)
Tensor input2col_trans;
DDim in2col_dims = input2col.dims();
T* input2col_trans_data = input2col_trans.mutable_data<T>(in2col_dims, context.GetPlace());
input2col_trans.Resize({in2col_dims[0], in2col_dims[3], in2col_dims[1], in2col_dims[2], in2col_dims[4]});
Tensor max2col;
max2col.ShareDataWith(*out);
max2col.Resize({in2col_dims[0], 1, in2col_dims[1], in2col_dims[2], in2col_dims[4]});
Tensor out_one_hot_tensor_trans;
out_one_hot_tensor_trans.mutable_data<T>(out_one_hot_tensor->dims(), context.GetPlace());
out_one_hot_tensor_trans.Resize({in2col_dims[0], in2col_dims[3], in2col_dims[1], in2col_dims[2], in2col_dims[4]});
// convert input2col (S, B, C, filter_size * filter_size, H_output * W_output)
// into input2col_trans (S, filter_size * filter_size, B, C, H_output * W_output)
const int Rank = 5;
Eigen::array<int, Rank> permute;
permute = {0, 3, 1, 2, 4};
auto eigen_in = framework::EigenTensor<T, Rank>::From(input2col);
auto eigen_out = framework::EigenTensor<T, Rank>::From(input2col_trans);
auto* dev = dev_ctx.eigen_device();
eigen_out.device(*dev) = eigen_in.shuffle(permute);
// maxpooling
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->max_pooling(
&input2col_trans, &max2col, &out_one_hot_tensor_trans);
permute = {0, 2, 3, 1, 4};
// convert out_one_hot_tensor_trans: (S, filter_size * filter_size, B, C, H_output * W_output)
// into out_one_hot_tensor (S, B, C, filter_size * filter_size, H_output * W_output)
auto eigen_in2 = framework::EigenTensor<T, Rank>::From(out_one_hot_tensor_trans);
auto eigen_out2 = framework::EigenTensor<T, Rank>::From(*out_one_hot_tensor);
eigen_out2.device(*dev) = eigen_in2.shuffle(permute);
// convert max2col: (S, 1, B, C, H_output * W_output)
// into out_one_hot_tensor (S, B, C, 1, H_output * W_output)
auto eigen_in3 = framework::EigenTensor<T, Rank>::From(max2col);
// flatten height & width
auto flatten_out_dims = out_dims;
flatten_out_dims[3] = 1;
flatten_out_dims[4] = out_dims[3] * out_dims[4];
out->Resize(flatten_out_dims);
auto eigen_out3 = framework::EigenTensor<T, Rank>::From(*out);
eigen_out3.device(*dev) = eigen_in3.shuffle(permute);
// reshape out (S, 1, B, C, H_output * W_output)
// into (S, B, C, H_output * W_output)
out->Resize(out_dims);
}
};
template <typename DeviceContext, typename T>
class MpcPoolGradKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &context) const override {
const Tensor* one_hot_tensor = context.Input<Tensor>("One_hot_tensor");
const Tensor* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::string data_format = context.Attr<std::string>("data_format"); // NCHW
bool global_pooling = context.Attr<bool>("global_pooling");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
if (in_x_grad) {
// update padding => h, w
auto in_x_dims = in_x_grad->dims();
auto out_dims = out_grad->dims();
framework::DDim data_dims;
data_dims = framework::slice_ddim(in_x_dims, 3, in_x_dims.size());
UpdatePadding(&paddings, global_pooling, padding_algorithm,
data_dims, strides, ksize);
if (data_dims.size() * 2 == static_cast<int>(paddings.size())) {
for (int i = 0; i < data_dims.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (global_pooling) {
UpdateKsize(&ksize, data_dims);
}
// create temp tensor
auto& dev_ctx = context.template device_context<DeviceContext>();
Tensor expanded_out_grad_tensor =
context.AllocateTmpTensor<T, DeviceContext>(one_hot_tensor->dims(), dev_ctx);
Tensor mul_result_tensor =
context.AllocateTmpTensor<T, DeviceContext>(one_hot_tensor->dims(), dev_ctx);
// create data var of input and output variable
T* in_x_grad_data = in_x_grad->mutable_data<T>(context.GetPlace());
std::fill(in_x_grad_data, in_x_grad_data + in_x_grad->numel(), static_cast<T>(0));
const T* one_hot_tensor_data = one_hot_tensor->data<T>();
const T* out_grad_data = out_grad->data<T>();
T* expanded_out_grad_data = expanded_out_grad_tensor.data<T>();
T* mul_result_data = mul_result_tensor.data<T>();
const int filter_numel = ksize[0] * ksize[1];
// stride = h * w
const int input_stride = in_x_dims[3] * in_x_dims[4];
const int output_stride = out_dims[3] * out_dims[4];
const int one_hot_tensor_stride = ksize[0] * ksize[1] * out_dims[3] * out_dims[4];
// stride: share0, share1
const int input_plaintext_size = in_x_grad->numel() / 2;
const int output_plaintext_size = out_grad->numel() / 2;
const int one_hot_tensor_plaintext_size = one_hot_tensor->numel() / 2;
// expand out grad
auto get_expand_out_grad = [=] (int ph, int pw, int input_height, int input_width,
int out_height, int out_width, int hstart, int hend,
int wstart, int wend, const T* src, T* target) {
size_t out_grad_index = ph * out_width + pw;
size_t offset = out_height * out_width;
for (size_t index = 0; index < filter_numel; ++index) {
target[out_grad_index + index * offset] = src[out_grad_index]; //share0
target[out_grad_index + index * offset + one_hot_tensor_plaintext_size] =
src[out_grad_index + output_plaintext_size]; // share1
}
};
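// Illustrative example (hypothetical sizes): with ksize = {2, 2}
// (filter_numel = 4), every output-grad element is replicated into the 4
// rows of its column, so multiplying by the one-hot tensor afterwards routes
// the gradient only to the window position that produced the max.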
// expand [S, B, C, H_poolout, W_poolout] into [S, B, C, ksize * ksize, H_poolout*W_poolout]
VisitDataStrideWise(in_x_dims, out_dims, ksize, strides, paddings, out_grad_data,
expanded_out_grad_data, output_stride, one_hot_tensor_stride, get_expand_out_grad);
// compute mul result = out_grad.expand * one_hot_tensor
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->arith_bool_mul(
&expanded_out_grad_tensor, one_hot_tensor, &mul_result_tensor);
// update input X's grad
auto update_in_grad = [=] (int ph, int pw,
int input_height, int input_width,
int out_height, int out_width,
int hstart, int hend, int wstart, int wend,
const T* src, T* target) {
size_t index = 0;
size_t in_pos = 0;
size_t out_grad_index = ph * out_width + pw;
size_t res_offset = out_height * out_width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
in_pos = h * input_width + w;
target[in_pos] += src[out_grad_index + index * res_offset]; // share0
target[in_pos + input_plaintext_size] +=
src[out_grad_index + index * res_offset + one_hot_tensor_plaintext_size]; // share1
++index;
}
}
};
// convert [S, B, C, filter_size * filter_size, H_output * W_output] into [S, B, C, H, W]
VisitDataStrideWise(in_x_dims, out_dims, ksize, strides, paddings, mul_result_data,
in_x_grad_data, one_hot_tensor_stride, input_stride, update_in_grad);
} //if (in_x_grad)
} // void ComputeImpl
}; // class MpcPoolGradKernel
} // namespace operators
} // namespace paddle
......@@ -25,7 +25,8 @@ class MpcReluOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Y", in_dims);
ctx->SetOutputDim("Out", in_dims);
ctx->SetOutputDim("Derivative", in_dims);
}
};
......@@ -34,7 +35,8 @@ class MpcReluOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input tensor.");
AddOutput("Y", "Output of relu_op");
AddOutput("Out", "Output of relu_op");
AddOutput("Derivative", "Derivative of relu_op");
AddComment(R"DOC(
Mpc Relu Operator.
)DOC");
......@@ -47,7 +49,7 @@ class MpcReluGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim(framework::GradVarName("Y"));
auto in_dims = ctx->GetInputDim(framework::GradVarName("Out"));
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
}
};
......@@ -61,8 +63,9 @@ public:
protected:
void Apply(GradOpPtr<T> grad) const override {
grad->SetType("mpc_relu_grad");
grad->SetInput("Y", this->Output("Y"));
grad->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
grad->SetInput("Out", this->Output("Out"));
grad->SetInput("Derivative", this->Output("Derivative"));
grad->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad->SetAttrMap(this->Attrs());
grad->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
}
......
......@@ -25,11 +25,14 @@ class MpcReluKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext& ctx) const override {
const Tensor* in_t = ctx.Input<Tensor>("X");
Tensor* out_t = ctx.Output<Tensor>("Y");
Tensor* out_t = ctx.Output<Tensor>("Out");
Tensor* der_t = ctx.Output<Tensor>("Derivative");
auto x = in_t->data<T>();
auto y = out_t->mutable_data<T>(ctx.GetPlace());
auto der = der_t->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE_NOT_NULL(mpc::MpcInstance::mpc_protocol, "MPC protocol has not been created in MpcInstance yet.");
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->relu(in_t,out_t);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()
->relu_with_derivative(in_t,out_t, der_t);
}
};
......@@ -38,11 +41,12 @@ template <typename DeviceContext, typename T>
class MpcReluGradKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext& ctx) const override {
auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto* y_t = ctx.Input<Tensor>("Y");
auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* y_t = ctx.Input<Tensor>("Out");
auto* der_t = ctx.Input<Tensor>("Derivative");
auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dx = dx_t->mutable_data<T>(ctx.GetPlace());
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->relu_grad(y_t, dy_t, dx_t, 0.0);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->arith_bool_mul(dy_t, der_t, dx_t);
}
};
......
......@@ -72,12 +72,6 @@ public:
" but the received var(%s)'s type is %s",
ctx->InputVarName("Param"), in_var_type);
ctx->SetOutputType("ParamOut", in_var_type);
//for (auto &out_var_n : framework::StaticGraphVarTypeInference::Output(ctx, "ParamOut")) {
// if (ctx->GetVarType(out_var_n) != in_var_type) {
// ctx->SetType(out_var_n, in_var_type);
//}
//}
}
};
......@@ -111,4 +105,4 @@ REGISTER_OPERATOR(
ops::MpcSGDOpInferVarType);
REGISTER_OP_CPU_KERNEL(
mpc_sgd,
ops::MpcSGDOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
ops::MpcSGDOpKernel<paddle::platform::CPUDeviceContext, int64_t, float>);
......@@ -19,7 +19,7 @@
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
template <typename DeviceContext, typename T, typename T1>
class MpcSGDOpKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &ctx) const override{
......@@ -47,14 +47,14 @@ class MpcSGDOpKernel : public MpcOpKernel<T> {
PADDLE_ENFORCE_EQ(param->numel(), sz);
PADDLE_ENFORCE_EQ(grad->numel(), sz);
const double *lr = learning_rate->data<double>();
double lr = *learning_rate->data<T1>();
param_out->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE_NOT_NULL(mpc::MpcInstance::mpc_protocol, "MPC protocol has not been created in MpcInstance yet.");
// update parameters
framework::Tensor temp;
temp.mutable_data<T>(param->dims(), ctx.GetPlace());
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(grad, lr[0], &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->scale(grad, lr, &temp);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->sub(param, &temp, param_out);
}
};
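// SGD update sketch (plaintext view, illustrative learning rate lr = 0.1):
// temp = lr * grad via the protocol's scale op, then
// param_out = param - temp, i.e. the usual step param - lr * grad.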
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "mpc_softmax_with_cross_entropy_op.h"
namespace paddle {
namespace operators {
class MpcSoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(
ctx->HasInput("Logits"), true,
platform::errors::InvalidArgument("Input(Logits) should be not null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Label"), true,
platform::errors::InvalidArgument("Input(Label) should be not null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Softmax"), true,
platform::errors::InvalidArgument(
"Output(Softmax) should be not null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("Loss"), true,
platform::errors::InvalidArgument("Output(Loss) should be not null."));
auto axis = ctx->Attrs().Get<int>("axis");
auto logits_dims = ctx->GetInputDim("Logits");
auto labels_dims = ctx->GetInputDim("Label");
auto logits_rank = logits_dims.size();
axis = CanonicalAxis(axis, logits_rank);
PADDLE_ENFORCE_GE(axis, logits_rank - 1,
platform::errors::InvalidArgument(
"Attr(axis) value should be -1 or R-1, "
"R is the rank of Input(Logits)."));
for (int i = 0; i < logits_rank; i++) {
if (i != axis) {
if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) {
PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i],
platform::errors::InvalidArgument(
"Input(Logits) and Input(Label) should in "
"same shape in dimensions except axis."));
}
}
}
bool soft_label = ctx->Attrs().Get<bool>("soft_label");
PADDLE_ENFORCE_EQ(soft_label, true,
platform::errors::InvalidArgument(
"soft_label can only be true! "));
if (soft_label) {
if (ctx->IsRuntime() ||
(logits_dims[axis] > 0 && labels_dims[axis] > 0)) {
PADDLE_ENFORCE_EQ(logits_dims[axis], labels_dims[axis],
platform::errors::InvalidArgument(
"If Attr(soft_label) == true, "
"the axis dimension of "
"Input(X) and Input(Label) should be equal."));
}
}
ctx->SetOutputDim("Softmax", logits_dims);
logits_dims[axis] = 1;
ctx->SetOutputDim("Loss", logits_dims);
ctx->ShareLoD("Logits", /*->*/ "Softmax");
ctx->ShareLoD("Logits", /*->*/ "Loss");
}
};
class MpcSoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true,
platform::errors::InvalidArgument(
"Input(Loss@Grad) should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true,
platform::errors::InvalidArgument(
"Input(Softmax) should be not null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Label"), true,
platform::errors::InvalidArgument("Input(Label) should be not null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true,
platform::errors::InvalidArgument(
"Output(Logits@Grad) should be not null."));
auto axis = ctx->Attrs().Get<int>("axis");
auto softmax_dims = ctx->GetInputDim("Softmax");
auto labels_dims = ctx->GetInputDim("Label");
auto softmax_rank = softmax_dims.size();
axis = CanonicalAxis(axis, softmax_rank);
PADDLE_ENFORCE_GE(axis, softmax_rank - 1,
platform::errors::InvalidArgument(
"Attr(axis) value should be -1 or R-1, "
"R is the rank of Input(Logits)."));
for (int i = 0; i < softmax_rank; i++) {
if (i != axis) {
if (ctx->IsRuntime() || (softmax_dims[i] > 0 && labels_dims[i] > 0)) {
PADDLE_ENFORCE_EQ(
softmax_dims[i], labels_dims[i],
platform::errors::InvalidArgument(
"Input(Logits) and Input(Label) should in same shape in "
"dimensions except axis."));
}
}
}
bool soft_label = ctx->Attrs().Get<bool>("soft_label");
PADDLE_ENFORCE_EQ(soft_label, true,
platform::errors::InvalidArgument(
"soft_label can only be true! "));
if (soft_label) {
if (ctx->IsRuntime() || (softmax_dims[axis] > 0 && labels_dims[axis] > 0)) {
PADDLE_ENFORCE_EQ(softmax_dims[axis], labels_dims[axis],
platform::errors::InvalidArgument(
"If Attr(soft_label) == true, "
"the axis dimension of "
"Input(X) and Input(Label) should be equal."));
}
}
ctx->SetOutputDim(framework::GradVarName("Logits"),
ctx->GetInputDim("Softmax"));
}
};
class MpcSoftmaxWithCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Logits",
"(Tensor, default: Tensor<float>), The input tensor of unscaled "
"log probabilities, whose dimension :attr:`axis` should be scaled "
"by softmax.");
AddInput(
"Label",
"(Tensor) The input tensor of groud truth label. If :attr:`soft_label` "
"is set to false, Label is a Tensor<int64> in same shape with "
"Input(Logits) except the shape in dimension :attr:`axis` as 1. If "
"soft_label is set to true, Label is a Tensor<float/double> in same "
"shape with Input(Logits).");
AddOutput(
"Softmax",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits). "
"The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.")
.AsIntermediate();
AddOutput("Loss",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits) "
"except the shape in dimension :attr:`axis` as 1. The cross "
"entropy loss.");
AddAttr<bool>(
"soft_label",
"(bool, default: false), A flag to indicate whether to interpretant "
"the given labels as soft labels.")
.SetDefault(false);
AddAttr<int>("axis",
"The dimension index of Input(Logits) to perform softmax,"
"default -1 for last dimension")
.SetDefault(-1);
AddAttr<bool>("use_relu", "").SetDefault(false);
AddAttr<bool>("use_long_div", "").SetDefault(true);
AddComment(R"DOC(
Softmax With Cross Entropy Operator.
Cross entropy loss with softmax is used as the output layer extensively. This
operator computes the softmax normalized values for each row of the input
tensor.
Computing the cross-entropy loss is not supported yet.
Currently, only soft_label=true and axis=-1 (or rank-1) are supported.
Forward: out = softmax(x). TODO: add cross_entropy.
Backward: dx = dout.expand * (softmax(x) - label)
)DOC");
}
};
template <typename T>
class MpcSoftmaxGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("mpc_softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", this->Input("Label"));
grad_op->SetInput("Softmax", this->Output("Softmax"));
grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
grad_op->SetOutput(framework::GradVarName("Logits"),
this->InputGrad("Logits"));
grad_op->SetAttrMap(this->Attrs());
}
};
DECLARE_INPLACE_OP_INFERER(MpcSoftmaxWithCrossEntropyInplaceInference,
{"Logits", "Softmax"});
DECLARE_INPLACE_OP_INFERER(MpcSoftmaxWithCrossEntropyGradInplaceInference,
{"Softmax", framework::GradVarName("Logits")});
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(mpc_softmax_with_cross_entropy, ops::MpcSoftmaxWithCrossEntropyOp,
ops::MpcSoftmaxWithCrossEntropyOpMaker,
ops::MpcSoftmaxGradMaker<paddle::framework::OpDesc>,
ops::MpcSoftmaxGradMaker<paddle::imperative::OpBase>,
ops::MpcSoftmaxWithCrossEntropyInplaceInference);
REGISTER_OPERATOR(mpc_softmax_with_cross_entropy_grad,
ops::MpcSoftmaxWithCrossEntropyOpGrad,
ops::MpcSoftmaxWithCrossEntropyGradInplaceInference);
REGISTER_OP_CPU_KERNEL(mpc_softmax_with_cross_entropy,
ops::MpcSoftmaxWithCrossEntropyKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(mpc_softmax_with_cross_entropy_grad,
ops::MpcSoftmaxWithCrossEntropyGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "mpc_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
static inline int CanonicalAxis(const int axis, const int rank) {
if (axis < 0) {
return axis + rank;
}
return axis;
}
static inline int SizeToAxis(const int axis, DDim dims) {
int size = 1;
for (int i = 0; i < axis; i++) {
size *= dims[i];
}
return size;
}
static inline int SizeFromAxis(const int axis, DDim dims) {
int size = 1;
for (int i = axis; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
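// Worked example (illustrative shape): for a shared logits tensor with
// dims = [2, 4, 10] (share, batch, class) and axis = -1:
//   CanonicalAxis(-1, 3) = 2, SizeToAxis(2, dims) = 2 * 4 = 8 (n),
//   SizeFromAxis(2, dims) = 10 (d).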
// Out = softmax(Logits) = relu(Logits_i) / sum(relu(Logits_i)): prediction of input.
// todo: loss=?
template <typename DeviceContext, typename T>
class MpcSoftmaxWithCrossEntropyKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &ctx) const override {
auto *in_x_t = ctx.Input<Tensor>("Logits");
auto *out_softmax_t = ctx.Output<Tensor>("Softmax");
auto *out_loss_t = ctx.Output<Tensor>("Loss");
out_softmax_t->mutable_data<T>(ctx.GetPlace());
out_loss_t->mutable_data<T>(ctx.GetPlace());
bool use_relu = ctx.Attr<bool>("use_relu");
bool use_long_div = ctx.Attr<bool>("use_long_div");
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->softmax(
in_x_t, out_softmax_t, use_relu, use_long_div);
}
};
// dx = dout.expand * (softmax(x) - labels)
template <typename DeviceContext, typename T>
class MpcSoftmaxWithCrossEntropyGradKernel : public MpcOpKernel<T> {
public:
void ComputeImpl(const framework::ExecutionContext &ctx) const override {
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Loss"));
auto *in_label_t = ctx.Input<Tensor>("Label");
auto *in_softmax_t = ctx.Input<Tensor>("Softmax");
auto *dx = ctx.Output<Tensor>(framework::GradVarName("Logits"));
const bool soft_label = ctx.Attr<bool>("soft_label");
PADDLE_ENFORCE_EQ(soft_label, true, "soft_label can only be true.");
const int rank = dx->dims().size();
const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
int axis_dim = dx->dims()[axis];
const int n = SizeToAxis(axis, dx->dims());
const int d = SizeFromAxis(axis, dx->dims());
T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
const T* dout_data = dout->data<T>();
// expand dout
Tensor dout_expand;
T* dout_expand_data = dout_expand.mutable_data<T>(dx->dims(), ctx.GetPlace());
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < d; ++j) {
dout_expand_data[i * d + j] = dout_data[i];
}
}
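// e.g. (illustrative) n = 2, d = 3: dout = [g0, g1] is expanded to
// dout_expand = [g0, g0, g0, g1, g1, g1] so it can be multiplied
// element-wise with (softmax - label) below.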
// dx = dout.expand * (softmax - label)
Tensor softmax_minus_label;
T* softmax_minus_label_data = softmax_minus_label.mutable_data<T>(dx->dims(), ctx.GetPlace());
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->sub(in_softmax_t, in_label_t, &softmax_minus_label);
mpc::MpcInstance::mpc_instance()->mpc_protocol()->mpc_operators()->mul(&dout_expand, &softmax_minus_label, dx);
}
};
} // namespace operators
} // namespace paddle
......@@ -39,9 +39,9 @@ public:
void init(size_t party, std::shared_ptr<AbstractNetwork> network, block seed,
block seed2) override {
set_num_party(2);
set_party(party);
set_network(network);
set_num_party(2);
if (psi::equals(seed2, psi::g_zero_block)) {
seed2 = psi::block_from_dev_urandom();
......
add_compile_options(-msse4.2 -maes)
set(PRIVC3_SRCS
"./aes.cc"
"./paddle_tensor.cc"
......@@ -8,16 +6,23 @@ set(PRIVC3_SRCS
"./tensor_adapter_factory.cc"
)
if (USE_AES_NI)
add_compile_definitions(USE_AES_NI)
endif (USE_AES_NI)
add_library(privc3_o OBJECT ${PRIVC3_SRCS})
add_dependencies(privc3_o fluid_framework)
add_library(privc3 STATIC $<TARGET_OBJECTS:privc3_o>)
target_link_libraries(privc3 fluid_framework)
if (USE_OPENMP)
target_link_libraries(privc3 fluid_framework OpenMP::OpenMP_CXX OpenMP::OpenMP_C crypto)
else()
target_link_libraries(privc3 fluid_framework crypto)
endif (USE_OPENMP)
cc_test(fixedpoint_util_test SRCS fixedpoint_util_test.cc DEPS privc3)
cc_test(paddle_tensor_test SRCS paddle_tensor_test.cc DEPS privc3)
cc_test(boolean_tensor_test SRCS boolean_tensor_test.cc DEPS privc3)
cc_test(fixedpoint_tensor_test SRCS fixedpoint_tensor_test.cc DEPS privc3)
#set(CMAKE_BUILD_TYPE "Debug")
......@@ -40,9 +40,9 @@ public:
void init(size_t party, std::shared_ptr<AbstractNetwork> network, block seed,
block seed2) override {
set_num_party(3);
set_party(party);
set_network(network);
set_num_party(3);
if (psi::equals(seed, psi::g_zero_block)) {
seed = psi::block_from_dev_urandom();
......
......@@ -23,33 +23,35 @@
namespace aby3 {
template <typename T, size_t N> class FixedPointTensor;
template<typename T, size_t N>
class FixedPointTensor;
template <typename T> class BooleanTensor {
template<typename T>
class BooleanTensor {
public:
BooleanTensor(TensorAdapter<T> *share_tensor[2]);
BooleanTensor(TensorAdapter<T>* share_tensor[2]);
BooleanTensor(TensorAdapter<T> *tensor0, TensorAdapter<T> *tensor1);
BooleanTensor(TensorAdapter<T>* tensor0, TensorAdapter<T>* tensor1);
BooleanTensor();
// ABY3 a2b
template <size_t N>
BooleanTensor &operator=(const FixedPointTensor<T, N> *other);
template<size_t N>
BooleanTensor& operator=(const FixedPointTensor<T, N>* other);
~BooleanTensor() {}
// get share
TensorAdapter<T> *share(size_t idx);
//get share
TensorAdapter<T>* share(size_t idx);
const TensorAdapter<T> *share(size_t idx) const;
const TensorAdapter<T>* share(size_t idx) const;
// reveal boolean tensor to one party
void reveal_to_one(size_t party_num, TensorAdapter<T> *ret) const;
void reveal_to_one(size_t party_num, TensorAdapter<T>* ret) const;
// reveal boolean tensor to all parties
void reveal(TensorAdapter<T> *ret) const;
void reveal(TensorAdapter<T>* ret) const;
const std::vector<size_t> shape() const;
......@@ -61,59 +63,63 @@ public:
// const std::string& rnd_seed = "");
// element-wise xor with BooleanTensor
void bitwise_xor(const BooleanTensor *rhs, BooleanTensor *ret) const;
void bitwise_xor(const BooleanTensor* rhs, BooleanTensor* ret) const;
// element-wise xor with TensorAdapter
void bitwise_xor(const TensorAdapter<T> *rhs, BooleanTensor *ret) const;
void bitwise_xor(const TensorAdapter<T>* rhs, BooleanTensor* ret) const;
// element-wise and with BooleanTensor
void bitwise_and(const BooleanTensor *rhs, BooleanTensor *ret) const;
void bitwise_and(const BooleanTensor* rhs, BooleanTensor* ret) const;
// element-wise and with TensorAdapter
void bitwise_and(const TensorAdapter<T> *rhs, BooleanTensor *ret) const;
void bitwise_and(const TensorAdapter<T>* rhs, BooleanTensor* ret) const;
// element-wise or with BooleanTensor
void bitwise_or(const BooleanTensor *rhs, BooleanTensor *ret) const;
// element-wise or with TensorAdapter
void bitwise_or(const TensorAdapter<T> *rhs, BooleanTensor *ret) const;
// element-wise or
// for both tensor adapter and boolean tensor
template<template<typename U> class CTensor>
void bitwise_or(const CTensor<T>* rhs, BooleanTensor* ret) const;
// element-wise not
void bitwise_not(BooleanTensor *ret) const;
void bitwise_not(BooleanTensor* ret) const;
// element-wise lshift
void lshift(size_t rhs, BooleanTensor *ret) const;
void lshift(size_t rhs, BooleanTensor* ret) const;
// element-wise rshift
void rshift(size_t rhs, BooleanTensor *ret) const;
void rshift(size_t rhs, BooleanTensor* ret) const;
// element-wise logical_rshift
void logical_rshift(size_t rhs, BooleanTensor *ret) const;
void logical_rshift(size_t rhs, BooleanTensor* ret) const;
// element-wise ppa with BooleanTensor
void ppa(const BooleanTensor *rhs, BooleanTensor *ret, size_t nbits) const;
void ppa(const BooleanTensor* rhs, BooleanTensor*ret , size_t nbits) const;
// ABY3 b2a
template <size_t N> void b2a(FixedPointTensor<T, N> *ret) const;
template<size_t N>
void b2a(FixedPointTensor<T, N>* ret) const;
// ABY3 ab mul
// this is a one-bit boolean share
template <size_t N>
void mul(const TensorAdapter<T> *rhs, FixedPointTensor<T, N> *ret,
size_t rhs_party) const;
template<size_t N>
void mul(const TensorAdapter<T>* rhs, FixedPointTensor<T, N>* ret, size_t rhs_party) const;
// ABY3 ab mul
// this is a one-bit boolean share
template <size_t N>
void mul(const FixedPointTensor<T, N> *rhs,
FixedPointTensor<T, N> *ret) const;
template<size_t N>
void mul(const FixedPointTensor<T, N>* rhs, FixedPointTensor<T, N>* ret) const;
// extract to this
template <size_t N>
void bit_extract(size_t i, const FixedPointTensor<T, N> *in);
template<size_t N>
void bit_extract(size_t i, const FixedPointTensor<T, N>* in);
// extract from this to ret
void bit_extract(size_t i, BooleanTensor *ret) const;
void bit_extract(size_t i, BooleanTensor* ret) const;
// turn all 1s to 0s except the last 1 in a col
// given cmp result from max pooling, generate one hot tensor
// indicating which element is max
// inplace transform
void onehot_from_cmp();
private:
static inline std::shared_ptr<AbstractContext> aby3_ctx() {
......@@ -131,9 +137,10 @@ private:
size_t party() const;
private:
TensorAdapter<T> *_share[2];
TensorAdapter<T>* _share[2];
};
} // namespace aby3
} //namespace aby3
#include "boolean_tensor_impl.h"
......@@ -18,49 +18,54 @@
namespace aby3 {
template <typename T> size_t BooleanTensor<T>::pre_party() const {
template<typename T>
size_t BooleanTensor<T>::pre_party() const {
return aby3_ctx()->pre_party();
}
template <typename T> size_t BooleanTensor<T>::next_party() const {
template<typename T>
size_t BooleanTensor<T>::next_party() const {
return aby3_ctx()->next_party();
}
template <typename T> size_t BooleanTensor<T>::party() const {
template<typename T>
size_t BooleanTensor<T>::party() const {
return aby3_ctx()->party();
}
template <typename T>
BooleanTensor<T>::BooleanTensor(TensorAdapter<T> *tensor[2]) {
template<typename T>
BooleanTensor<T>::BooleanTensor(TensorAdapter<T>* tensor[2]) {
// TODO: check if tensor shape equal
_share[0] = tensor[0];
_share[1] = tensor[1];
}
template <typename T>
BooleanTensor<T>::BooleanTensor(TensorAdapter<T> *tensor0,
TensorAdapter<T> *tensor1) {
template<typename T>
BooleanTensor<T>::BooleanTensor(TensorAdapter<T>* tensor0,
TensorAdapter<T>* tensor1) {
// TODO: check if tensor shape equal
_share[0] = tensor0;
_share[1] = tensor1;
}
template <typename T> BooleanTensor<T>::BooleanTensor() {}
template<typename T>
BooleanTensor<T>::BooleanTensor() {
}
template <typename T> TensorAdapter<T> *BooleanTensor<T>::share(size_t idx) {
template<typename T>
TensorAdapter<T>* BooleanTensor<T>::share(size_t idx) {
// TODO: check if idx < 2
return _share[idx];
}
template <typename T>
const TensorAdapter<T> *BooleanTensor<T>::share(size_t idx) const {
template<typename T>
const TensorAdapter<T>* BooleanTensor<T>::share(size_t idx) const {
// TODO: check if idx < 2
return _share[idx];
}
template <typename T>
void BooleanTensor<T>::reveal_to_one(size_t party_num,
TensorAdapter<T> *ret) const {
template<typename T>
void BooleanTensor<T>::reveal_to_one(size_t party_num, TensorAdapter<T>* ret) const {
if (party_num == party()) {
// TODO: check if tensor shape equal
......@@ -75,50 +80,54 @@ void BooleanTensor<T>::reveal_to_one(size_t party_num,
} else if (party_num == next_party()) {
aby3_ctx()->network()->template send(party_num, *share(0));
}
}
template <typename T>
void BooleanTensor<T>::reveal(TensorAdapter<T> *ret) const {
template<typename T>
void BooleanTensor<T>::reveal(TensorAdapter<T>* ret) const {
for (size_t idx = 0; idx < 3; ++idx) {
reveal_to_one(idx, ret);
}
}
template <typename T>
template<typename T>
const std::vector<size_t> BooleanTensor<T>::shape() const {
if (share(0)) {
return share(0)->shape();
} else {
}
else {
return std::vector<size_t>();
}
}
template <typename T> size_t BooleanTensor<T>::numel() const {
template<typename T>
size_t BooleanTensor<T>::numel() const {
if (share(0)) {
return share(0)->numel();
} else {
}
else {
return 0;
}
}
template <typename T>
void BooleanTensor<T>::bitwise_xor(const BooleanTensor *rhs,
BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::bitwise_xor(const BooleanTensor* rhs,
BooleanTensor* ret) const {
share(0)->bitwise_xor(rhs->share(0), ret->share(0));
share(1)->bitwise_xor(rhs->share(1), ret->share(1));
}
template <typename T>
void BooleanTensor<T>::bitwise_xor(const TensorAdapter<T> *rhs,
BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::bitwise_xor(const TensorAdapter<T>* rhs,
BooleanTensor* ret) const {
share(0)->bitwise_xor(rhs, ret->share(0));
share(1)->bitwise_xor(rhs, ret->share(1));
}
template <typename T>
void BooleanTensor<T>::bitwise_and(const BooleanTensor *rhs,
BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::bitwise_and(const BooleanTensor* rhs,
BooleanTensor* ret) const {
auto tmp_zero = tensor_factory()->template create<T>(ret->shape());
auto tmp0 = tensor_factory()->template create<T>(ret->shape());
......@@ -149,37 +158,36 @@ void BooleanTensor<T>::bitwise_and(const BooleanTensor *rhs,
}
}
template <typename T>
void BooleanTensor<T>::bitwise_and(const TensorAdapter<T> *rhs,
BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::bitwise_and(const TensorAdapter<T>* rhs,
BooleanTensor* ret) const {
share(0)->bitwise_and(rhs, ret->share(0));
share(1)->bitwise_and(rhs, ret->share(1));
}
template <typename T>
void BooleanTensor<T>::bitwise_or(const BooleanTensor *rhs,
BooleanTensor *ret) const {
// ret = x & y
bitwise_and(rhs, ret);
// ret = x & y ^ x
bitwise_xor(ret, ret);
// ret = x & y ^ x ^ y
rhs->bitwise_xor(ret, ret);
}
template<typename T>
template<template<typename U> class CTensor>
void BooleanTensor<T>::bitwise_or(const CTensor<T>* rhs,
BooleanTensor* ret) const {
template <typename T>
void BooleanTensor<T>::bitwise_or(const TensorAdapter<T> *rhs,
BooleanTensor *ret) const {
std::vector<std::shared_ptr<TensorAdapter<T>>> tmp;
for (int i = 0; i < 2; ++i) {
tmp.emplace_back(
tensor_factory()->template create<T>(shape()));
}
BooleanTensor buffer(tmp[0].get(), tmp[1].get());
// ret = x & y
bitwise_and(rhs, ret);
bitwise_and(rhs, &buffer);
// ret = x & y ^ x
bitwise_xor(ret, ret);
bitwise_xor(&buffer, &buffer);
// ret = x & y ^ x ^ y
ret->bitwise_xor(rhs, ret);
buffer.bitwise_xor(rhs, ret);
}
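// Sanity check on plaintext bits (illustrative): x = 0b1100, y = 0b1010:
// x & y = 0b1000, (x & y) ^ x = 0b0100, ((x & y) ^ x) ^ y = 0b1110 = x | y,
// which is the identity realized above on shares.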
template <typename T>
void BooleanTensor<T>::bitwise_not(BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::bitwise_not(BooleanTensor* ret) const {
if (party() == 0) {
share(0)->bitwise_not(ret->share(0));
share(1)->copy(ret->share(1));
......@@ -192,26 +200,27 @@ void BooleanTensor<T>::bitwise_not(BooleanTensor *ret) const {
}
}
template <typename T>
void BooleanTensor<T>::lshift(size_t rhs, BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::lshift(size_t rhs, BooleanTensor* ret) const {
share(0)->lshift(rhs, ret->share(0));
share(1)->lshift(rhs, ret->share(1));
}
template <typename T>
void BooleanTensor<T>::rshift(size_t rhs, BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::rshift(size_t rhs, BooleanTensor* ret) const {
share(0)->rshift(rhs, ret->share(0));
share(1)->rshift(rhs, ret->share(1));
}
template <typename T>
void BooleanTensor<T>::logical_rshift(size_t rhs, BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::logical_rshift(size_t rhs, BooleanTensor* ret) const {
share(0)->logical_rshift(rhs, ret->share(0));
share(1)->logical_rshift(rhs, ret->share(1));
}
template <typename T>
void BooleanTensor<T>::ppa(const BooleanTensor *rhs, BooleanTensor *ret,
template<typename T>
void BooleanTensor<T>::ppa(const BooleanTensor* rhs,
BooleanTensor* ret,
size_t n_bits) const {
// Kogge-Stone adder, adapted from tfe
// https://github.com/tf-encrypted
......@@ -219,11 +228,11 @@ void BooleanTensor<T>::ppa(const BooleanTensor *rhs, BooleanTensor *ret,
const size_t k = std::ceil(std::log2(n_bits));
std::vector<T> keep_masks(k);
for (size_t i = 0; i < k; ++i) {
keep_masks[i] = (T(1) << (T)std::exp2(i)) - 1;
keep_masks[i] = (T(1) << (T) std::exp2(i)) - 1;
}
std::shared_ptr<TensorAdapter<T>> tmp[11];
for (auto &ti : tmp) {
for (auto& ti: tmp) {
ti = tensor_factory()->template create<T>(ret->shape());
}
BooleanTensor<T> g(tmp[0].get(), tmp[1].get());
......@@ -245,6 +254,7 @@ void BooleanTensor<T>::ppa(const BooleanTensor *rhs, BooleanTensor *ret,
g.lshift(std::exp2(i), &g1);
p.lshift(std::exp2(i), &p1);
p1.bitwise_xor(k_mask, &p1);
g1.bitwise_and(&p, &c);
......@@ -257,12 +267,15 @@ void BooleanTensor<T>::ppa(const BooleanTensor *rhs, BooleanTensor *ret,
c.bitwise_xor(&p, ret);
}
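// A minimal plaintext reference sketch of the Kogge-Stone addition realized
// above; `plain_kogge_stone_add` is a hypothetical helper on ordinary
// integers (no secret sharing), shown only to make the g/p recursion and the
// keep-mask handling concrete.
static inline unsigned long long plain_kogge_stone_add(unsigned long long a,
unsigned long long b) {
unsigned long long g = a & b; // generate bits
unsigned long long p = a ^ b; // propagate bits
for (int i = 0; i < 6; ++i) { // log2(64) rounds
unsigned long long s = 1ull << i;
unsigned long long g1 = g << s;
unsigned long long p1 = (p << s) | ((1ull << s) - 1); // keep mask fills low bits
g ^= p & g1; // xor equals or here, since g & p == 0 bitwise
p &= p1;
}
unsigned long long c = g << 1; // carry into each bit position
return a ^ b ^ c; // equals a + b (mod 2^64)
}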
template <typename T, size_t N>
void a2b(AbstractContext *aby3_ctx, TensorAdapterFactory *tensor_factory,
const FixedPointTensor<T, N> *a, BooleanTensor<T> *b, size_t n_bits) {
template<typename T, size_t N>
void a2b(AbstractContext* aby3_ctx,
TensorAdapterFactory* tensor_factory,
const FixedPointTensor<T, N>* a,
BooleanTensor<T>* b,
size_t n_bits) {
std::shared_ptr<TensorAdapter<T>> tmp[4];
for (auto &ti : tmp) {
for (auto& ti: tmp) {
ti = tensor_factory->template create<T>(a->shape());
// set 0
std::transform(ti->data(), ti->data() + ti->numel(), ti->data(),
......@@ -305,42 +318,41 @@ void a2b(AbstractContext *aby3_ctx, TensorAdapterFactory *tensor_factory,
lhs->ppa(rhs.get(), b, n_bits);
}
template <typename T>
template <size_t N>
BooleanTensor<T> &BooleanTensor<T>::
operator=(const FixedPointTensor<T, N> *other) {
template<typename T>
template<size_t N>
BooleanTensor<T>& BooleanTensor<T>::operator=(const FixedPointTensor<T, N>* other) {
a2b(aby3_ctx().get(), tensor_factory().get(), other, this, sizeof(T) * 8);
return *this;
}
template <typename T>
void tensor_rshift_transform(const TensorAdapter<T> *lhs, size_t rhs,
TensorAdapter<T> *ret) {
const T *begin = lhs->data();
void tensor_rshift_transform(const TensorAdapter<T>* lhs,
size_t rhs, TensorAdapter<T>* ret) {
const T* begin = lhs->data();
std::transform(begin, begin + lhs->numel(), ret->data(),
[rhs](T in) { return (in >> rhs) & 1; });
};
template <typename T>
template <size_t N>
void BooleanTensor<T>::bit_extract(size_t i, const FixedPointTensor<T, N> *in) {
template<typename T>
template<size_t N>
void BooleanTensor<T>::bit_extract(size_t i, const FixedPointTensor<T, N>* in) {
a2b(aby3_ctx().get(), tensor_factory().get(), in, this, i + 1);
tensor_rshift_transform(share(0), i, share(0));
tensor_rshift_transform(share(1), i, share(1));
}
template <typename T>
void BooleanTensor<T>::bit_extract(size_t i, BooleanTensor *ret) const {
template<typename T>
void BooleanTensor<T>::bit_extract(size_t i, BooleanTensor* ret) const {
tensor_rshift_transform(share(0), i, ret->share(0));
tensor_rshift_transform(share(1), i, ret->share(1));
}
template <typename T>
template <size_t N>
void BooleanTensor<T>::b2a(FixedPointTensor<T, N> *ret) const {
template<typename T>
template<size_t N>
void BooleanTensor<T>::b2a(FixedPointTensor<T, N>* ret) const {
std::shared_ptr<TensorAdapter<T>> tmp[2];
for (auto &ti : tmp) {
for (auto& ti: tmp) {
ti = tensor_factory()->template create<T>(shape());
// set 0
std::transform(ti->data(), ti->data() + ti->numel(), ti->data(),
......@@ -364,7 +376,7 @@ void BooleanTensor<T>::b2a(FixedPointTensor<T, N> *ret) const {
bt.ppa(this, &bt, sizeof(T) * 8);
TensorAdapter<T> *dest = nullptr;
TensorAdapter<T>* dest = nullptr;
if (party() == 0) {
dest = ret->mutable_share(0);
}
......@@ -381,10 +393,10 @@ void BooleanTensor<T>::b2a(FixedPointTensor<T, N> *ret) const {
}
}
template <typename T>
template <size_t N>
void BooleanTensor<T>::mul(const TensorAdapter<T> *rhs,
FixedPointTensor<T, N> *ret,
template<typename T>
template<size_t N>
void BooleanTensor<T>::mul(const TensorAdapter<T>* rhs,
FixedPointTensor<T, N>* ret,
size_t rhs_party) const {
// ot sender
size_t idx0 = rhs_party;
......@@ -396,19 +408,21 @@ void BooleanTensor<T>::mul(const TensorAdapter<T> *rhs,
auto tmp0 = tensor_factory()->template create<T>(ret->shape());
auto tmp1 = tensor_factory()->template create<T>(ret->shape());
TensorAdapter<T> *tmp[2] = {tmp0.get(), tmp1.get()};
TensorAdapter<T>* tmp[2] = {tmp0.get(), tmp1.get()};
TensorAdapter<T> *null_arg[2] = {nullptr, nullptr};
TensorAdapter<T>* null_arg[2] = {nullptr, nullptr};
if (party() == idx0) {
// use ret as buffer
TensorAdapter<T> *m[2] = {ret->mutable_share(0), ret->mutable_share(1)};
TensorAdapter<T>* m[2] = {ret->mutable_share(0), ret->mutable_share(1)};
aby3_ctx()->template gen_zero_sharing_arithmetic(*tmp[0]);
// m0 = a * (b0 ^ b1) + s0
// m1 = a * (1 ^ b0 ^ b1) + s0
share(0)->bitwise_xor(share(1), m[0]);
std::transform(m[0]->data(), m[0]->data() + m[0]->numel(), m[0]->data(),
[](T in) { return 1 & in; });
std::transform(m[0]->data(), m[0]->data() + m[0]->numel(), m[1]->data(),
[](T in) { return 1 ^ in; });
......@@ -419,8 +433,8 @@ void BooleanTensor<T>::mul(const TensorAdapter<T> *rhs,
m[1]->add(tmp[0], m[1]);
aby3_ctx()->template ot(idx0, idx1, idx2, null_arg[0],
const_cast<const aby3::TensorAdapter<T> **>(m), tmp,
null_arg[0]);
const_cast<const aby3::TensorAdapter<T>**>(m),
tmp, null_arg[0]);
// ret0 = s2
// ret1 = s1
......@@ -431,20 +445,18 @@ void BooleanTensor<T>::mul(const TensorAdapter<T> *rhs,
// ret0 = s1
aby3_ctx()->template gen_zero_sharing_arithmetic(*(ret->mutable_share(0)));
// ret1 = a * b + s0
aby3_ctx()->template ot(
idx0, idx1, idx2, share(1),
const_cast<const aby3::TensorAdapter<T> **>(null_arg), tmp,
ret->mutable_share(1));
aby3_ctx()->template ot(idx0, idx1, idx2, share(1),
const_cast<const aby3::TensorAdapter<T>**>(null_arg),
tmp, ret->mutable_share(1));
aby3_ctx()->network()->template send(idx0, *(ret->share(0)));
aby3_ctx()->network()->template send(idx2, *(ret->share(1)));
} else if (party() == idx2) {
// ret0 = a * b + s0
aby3_ctx()->template gen_zero_sharing_arithmetic(*(ret->mutable_share(1)));
// ret1 = s2
aby3_ctx()->template ot(
idx0, idx1, idx2, share(0),
const_cast<const aby3::TensorAdapter<T> **>(null_arg), tmp,
null_arg[0]);
aby3_ctx()->template ot(idx0, idx1, idx2, share(0),
const_cast<const aby3::TensorAdapter<T>**>(null_arg),
tmp, null_arg[0]);
aby3_ctx()->network()->template send(idx0, *(ret->share(1)));
......@@ -452,31 +464,68 @@ void BooleanTensor<T>::mul(const TensorAdapter<T> *rhs,
}
}
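// Reading of the OT step above (a sketch, assuming standard 1-out-of-2 OT
// semantics): for a value a known to party idx0 and a boolean-shared bit
// b = b0 ^ b1 ^ b2, the sender prepares
// m0 = a * (b0 ^ b1) + s0 and m1 = a * (1 ^ b0 ^ b1) + s0,
// so the receiver, choosing with its remaining bit share, obtains
// a * (b0 ^ b1 ^ b2) + s0 = a * b + s0,
// and the zero sharing s0 + s1 + s2 = 0 turns the three outputs into a fresh
// arithmetic sharing of a * b.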
template <typename T>
template <size_t N>
void BooleanTensor<T>::mul(const FixedPointTensor<T, N> *rhs,
FixedPointTensor<T, N> *ret) const {
auto tmp0 = tensor_factory()->template create<T>(ret->shape());
auto tmp1 = tensor_factory()->template create<T>(ret->shape());
auto tmp2 = tensor_factory()->template create<T>(ret->shape());
template<typename T>
template<size_t N>
void BooleanTensor<T>::mul(const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret) const {
std::vector<std::shared_ptr<TensorAdapter<T>>> tmp;
for (int i = 0; i < 4; ++i) {
tmp.emplace_back(
tensor_factory()->template create<T>(ret->shape()));
}
FixedPointTensor<T, N> tmp(tmp0.get(), tmp1.get());
FixedPointTensor<T, N> tmp0(tmp[0].get(), tmp[1].get());
FixedPointTensor<T, N> tmp1(tmp[2].get(), tmp[3].get());
if (party() == 0) {
mul(nullptr, ret, 1);
mul(rhs->share(0), &tmp, 0);
ret->add(&tmp, ret);
mul(nullptr, &tmp0, 1);
mul(rhs->share(0), &tmp1, 0);
} else if (party() == 1) {
rhs->share(0)->add(rhs->share(1), tmp2.get());
mul(tmp2.get(), ret, 1);
mul(nullptr, &tmp, 0);
ret->add(&tmp, ret);
rhs->share(0)->add(rhs->share(1), tmp[2].get());
mul(tmp[2].get(), &tmp0, 1);
mul(nullptr, &tmp1, 0);
} else { // party() == 2
mul(nullptr, ret, 1);
mul(nullptr, &tmp, 0);
ret->add(&tmp, ret);
mul(nullptr, &tmp0, 1);
mul(nullptr, &tmp1, 0);
}
tmp0.add(&tmp1, ret);
}
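// Sketch of the decomposition above, assuming the usual ABY3 replicated
// layout where party i holds shares (x_i, x_{i+1}) of x = x0 + x1 + x2:
// b * x = b * (x1 + x2) + b * x0,
// so the first mul() call (rhs_party = 1, party 1 supplies x1 + x2) produces
// tmp0, the second (rhs_party = 0, party 0 supplies x0) produces tmp1,
// and the two partial products are added into ret at the end.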
template<typename T>
void BooleanTensor<T>::onehot_from_cmp() {
// cmp is done slice by slice
// suppose that shape = [k, m, n, ...]
// then the shape of all slices and tmp tensors is [1, m, n, ...]
auto shape_ = shape();
size_t len = shape_[0];
shape_[0] = 1;
std::vector<std::shared_ptr<TensorAdapter<T>>> tmp;
for (int i = 0; i < 4; ++i) {
tmp.emplace_back(
tensor_factory()->template create<T>(shape_));
}
tmp.emplace_back(tensor_factory()->template create<T>());
tmp.emplace_back(tensor_factory()->template create<T>());
BooleanTensor found(tmp[0].get(), tmp[1].get());
assign_to_tensor(tmp[0].get(), T(0));
assign_to_tensor(tmp[1].get(), T(0));
BooleanTensor not_found(tmp[2].get(), tmp[3].get());
// res[i] = !found & input[i]
// found = found | res[i]
// to find last 1, we search backward
for (size_t i = len; i > 0; --i) {
share(0)->slice(i - 1, i, tmp[4].get());
share(1)->slice(i - 1, i, tmp[5].get());
BooleanTensor cmp_i(tmp[4].get(), tmp[5].get());
found.bitwise_not(&not_found);
not_found.bitwise_and(&cmp_i, &cmp_i);
cmp_i.bitwise_or(&found, &found);
}
}
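// Worked example: for a column whose plaintext along dim 0 is [1, 0, 1, 0],
// the backward scan flips `found` at index 2 (the last 1) and `not_found`
// masks out the earlier 1 at index 0, so the result is the one-hot [0, 0, 1, 0]
// (this matches the expectation checked in onehot_from_cmp_test).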
} // namespace aby3
......@@ -1215,9 +1215,9 @@ TEST_F(BooleanTensorTest, abmul_test) {
gen1(), gen1(), gen1()};
// lhs = 1
sl[0]->data()[0] = 1;
sl[1]->data()[0] = 0;
sl[2]->data()[0] = 0;
sl[0]->data()[0] = -1;
sl[1]->data()[0] = -3;
sl[2]->data()[0] = 3;
BTensor b0(sl[0].get(), sl[1].get());
BTensor b1(sl[1].get(), sl[2].get());
......@@ -1274,9 +1274,9 @@ TEST_F(BooleanTensorTest, abmul2_test) {
gen1(), gen1(), gen1()};
// lhs = 1
sl[0]->data()[0] = 1;
sl[1]->data()[0] = 0;
sl[2]->data()[0] = 0;
sl[0]->data()[0] = -3;
sl[1]->data()[0] = -1;
sl[2]->data()[0] = 3;
// rhs = 12 = 3 + 4 + 5
sr[0]->data()[0] = 3;
......@@ -1331,4 +1331,197 @@ TEST_F(BooleanTensorTest, abmul2_test) {
}
EXPECT_EQ(1 * 12, p->data()[0]);
}
TEST_F(BooleanTensorTest, abmul3_test) {
std::shared_ptr<TensorAdapter<int64_t>> sl[3] = { gen1(), gen1(), gen1() };
std::shared_ptr<TensorAdapter<int64_t>> sr[3] = { gen1(), gen1(), gen1() };
std::shared_ptr<TensorAdapter<int64_t>> sout[6] = { gen1(), gen1(), gen1(),
gen1(), gen1(), gen1()};
// lhs = 0
sl[0]->data()[0] = 373964488827046757;
sl[1]->data()[0] = -2697357730885869060;
sl[2]->data()[0] = -2332413979122373991;
// rhs = -1
sr[0]->data()[0] = 8388121746490115866;
sr[1]->data()[0] = 5851959018403668595;
sr[2]->data()[0] = 4206663308815767154;
BTensor bl0(sl[0].get(), sl[1].get());
BTensor bl1(sl[1].get(), sl[2].get());
BTensor bl2(sl[2].get(), sl[0].get());
FTensor fr0(sr[0].get(), sr[1].get());
FTensor fr1(sr[1].get(), sr[2].get());
FTensor fr2(sr[2].get(), sr[0].get());
FTensor fout0(sout[0].get(), sout[1].get());
FTensor fout1(sout[2].get(), sout[3].get());
FTensor fout2(sout[4].get(), sout[5].get());
auto p = gen1();
_t[0] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[0], [&](){
bl0.mul(&fr0, &fout0);
fout0.reveal_to_one(0, p.get());
});
}
);
_t[1] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[1], [&](){
bl1.mul(&fr1, &fout1);
fout1.reveal_to_one(0, nullptr);
});
}
);
_t[2] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[2], [&](){
bl2.mul(&fr2, &fout2);
fout2.reveal_to_one(0, nullptr);
});
}
);
for (auto &t: _t) {
t.join();
}
EXPECT_EQ(0, p->data()[0]);
}
TEST_F(BooleanTensorTest, abmul4_test) {
std::shared_ptr<TensorAdapter<int64_t>> sl[3] = { gen1(), gen1(), gen1() };
std::shared_ptr<TensorAdapter<int64_t>> sr[3] = { gen1(), gen1(), gen1() };
std::shared_ptr<TensorAdapter<int64_t>> sout[6] = { gen1(), gen1(), gen1(),
gen1(), gen1(), gen1()};
// lhs = 1
sl[0]->data()[0] = 373964488827046757;
sl[1]->data()[0] = -2697357730885869060;
sl[2]->data()[0] = -2332413979122373992;
// rhs = -1
sr[0]->data()[0] = 8388121746490115866;
sr[1]->data()[0] = 5851959018403668595;
sr[2]->data()[0] = 4206663308815767154;
BTensor bl0(sl[0].get(), sl[1].get());
BTensor bl1(sl[1].get(), sl[2].get());
BTensor bl2(sl[2].get(), sl[0].get());
FTensor fr0(sr[0].get(), sr[1].get());
FTensor fr1(sr[1].get(), sr[2].get());
FTensor fr2(sr[2].get(), sr[0].get());
FTensor fout0(sout[0].get(), sout[1].get());
FTensor fout1(sout[2].get(), sout[3].get());
FTensor fout2(sout[4].get(), sout[5].get());
auto p = gen1();
_t[0] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[0], [&](){
bl0.mul(&fr0, &fout0);
fout0.reveal_to_one(0, p.get());
});
}
);
_t[1] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[1], [&](){
bl1.mul(&fr1, &fout1);
fout1.reveal_to_one(0, nullptr);
});
}
);
_t[2] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[2], [&](){
bl2.mul(&fr2, &fout2);
fout2.reveal_to_one(0, nullptr);
});
}
);
for (auto &t: _t) {
t.join();
}
EXPECT_EQ(-1, p->data()[0]);
}
TEST_F(BooleanTensorTest, onehot_from_cmp_test) {
std::vector<size_t> shape = {4, 1};
std::shared_ptr<TensorAdapter<int64_t>> sout[6] =
{ gen(shape), gen(shape), gen(shape), gen(shape), gen(shape), gen(shape)};
for (auto& ptr: sout) {
assign_to_tensor(ptr.get(), 0l);
}
sout[0].get()->data()[0] = 1;
sout[0].get()->data()[2] = 1;
sout[5].get()->data()[0] = 1;
sout[5].get()->data()[2] = 1;
// input plaintext [1010]
BTensor bout0(sout[0].get(), sout[1].get());
BTensor bout1(sout[2].get(), sout[3].get());
BTensor bout2(sout[4].get(), sout[5].get());
auto p = gen(shape);
_t[0] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[0], [&](){
bout0.onehot_from_cmp();
bout0.reveal_to_one(0, p.get());
});
}
);
_t[1] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[1], [&](){
bout1.onehot_from_cmp();
bout1.reveal_to_one(0, nullptr);
});
}
);
_t[2] = std::thread(
[&] () {
ContextHolder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[2], [&](){
bout2.onehot_from_cmp();
bout2.reveal_to_one(0, nullptr);
});
}
);
for (auto &t: _t) {
t.join();
}
EXPECT_EQ(0, p->data()[0]);
EXPECT_EQ(0, p->data()[1]);
EXPECT_EQ(1, p->data()[2]);
EXPECT_EQ(0, p->data()[3]);
}
} // namespace aby3
......@@ -20,123 +20,178 @@
#include "aby3_context.h"
#include "core/paddlefl_mpc/mpc_protocol/context_holder.h"
#include "paddle_tensor.h"
#include "boolean_tensor.h"
#include "core/paddlefl_mpc/mpc_protocol/context_holder.h"
namespace aby3 {
template <typename T, size_t N> class FixedPointTensor {
template<typename T, size_t N>
class FixedPointTensor {
public:
explicit FixedPointTensor(TensorAdapter<T> *share_tensor[2]);
explicit FixedPointTensor(TensorAdapter<T>* share_tensor[2]);
explicit FixedPointTensor(TensorAdapter<T> *share_tensor_0,
TensorAdapter<T> *share_tensor_1);
explicit FixedPointTensor(TensorAdapter<T>* share_tensor_0,
TensorAdapter<T>* share_tensor_1);
~FixedPointTensor(){};
~FixedPointTensor() {};
// get mutable share of tensor
TensorAdapter<T> *mutable_share(size_t idx);
//get mutable share of tensor
TensorAdapter<T>* mutable_share(size_t idx);
const TensorAdapter<T> *share(size_t idx) const;
const TensorAdapter<T>* share(size_t idx) const;
size_t numel() const { return _share[0]->numel(); }
size_t numel() const {
return _share[0]->numel();
}
// reveal fixedpointtensor to one party
void reveal_to_one(size_t party, TensorAdapter<T> *ret) const;
void reveal_to_one(size_t party, TensorAdapter<T>* ret) const;
// reveal fixedpointtensor to all parties
void reveal(TensorAdapter<T> *ret) const;
void reveal(TensorAdapter<T>* ret) const;
const std::vector<size_t> shape() const;
// convert TensorAdapter to shares
static void share(const TensorAdapter<T> *input,
TensorAdapter<T> *output_shares[3],
//convert TensorAdapter to shares
static void share(const TensorAdapter<T>* input,
TensorAdapter<T>* output_shares[3],
block seed = g_zero_block);
// element-wise add with FixedPointTensor
void add(const FixedPointTensor *rhs, FixedPointTensor *ret) const;
void add(const FixedPointTensor* rhs, FixedPointTensor* ret) const;
// element-wise add with TensorAdapter
void add(const TensorAdapter<T> *rhs, FixedPointTensor *ret) const;
void add(const TensorAdapter<T>* rhs, FixedPointTensor* ret) const;
// element-wise sub with FixedPointTensor
void sub(const FixedPointTensor *rhs, FixedPointTensor *ret) const;
void sub(const FixedPointTensor* rhs, FixedPointTensor* ret) const;
// element-wise sub with TensorAdapter
void sub(const TensorAdapter<T> *rhs, FixedPointTensor *ret) const;
void sub(const TensorAdapter<T>* rhs, FixedPointTensor* ret) const;
// negative
void negative(FixedPointTensor *ret) const;
void negative(FixedPointTensor* ret) const;
// element-wise mul with FixedPointTensor using truncate1
void mul(const FixedPointTensor *rhs, FixedPointTensor *ret) const;
void mul(const FixedPointTensor* rhs, FixedPointTensor* ret) const;
// element-wise mul with TensorAdapter
void mul(const TensorAdapter<T> *rhs, FixedPointTensor *ret) const;
void mul(const TensorAdapter<T>* rhs, FixedPointTensor* ret) const;
// div by TensorAdapter
void div(const TensorAdapter<T> *rhs, FixedPointTensor *ret) const;
void div(const TensorAdapter<T>* rhs, FixedPointTensor* ret) const;
// div by FixedPointedTensor
// TODO@yqy : operator rhs <= 0 is not supported yet
void div(const FixedPointTensor* rhs, FixedPointTensor* ret,
size_t iter = 16, double x0 = pow(2, -15)) const;
// long div by boolean circuit
// res_int_len: estimated bit len of the integer part of result
void long_div(const FixedPointTensor* rhs,
FixedPointTensor* ret, size_t res_int_len = 20) const;
// element-wise mul, use trunc2
void mul2(const FixedPointTensor *rhs, FixedPointTensor *ret) const;
void inverse_square_root(FixedPointTensor* ret,
size_t iter = 16, double x0 = 0x1p-10) const;
// dot_mul
template <template <typename U, size_t...> class CTensor, size_t... N1>
void dot_mul(const CTensor<T, N1...> *rhs, FixedPointTensor *ret) const;
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void dot_mul(const CTensor<T, N1...>* rhs, FixedPointTensor* ret) const;
// sum all element
void sum(FixedPointTensor *ret) const;
//sum all element
void sum(FixedPointTensor* ret) const;
// mat_mul with FixedPointTensor
void mat_mul(const FixedPointTensor *rhs, FixedPointTensor *ret) const;
void mat_mul(const FixedPointTensor* rhs, FixedPointTensor* ret) const;
// mat_mul with TensorAdapter
void mat_mul(const TensorAdapter<T> *rhs, FixedPointTensor *ret) const;
void mat_mul(const TensorAdapter<T>* rhs, FixedPointTensor* ret) const;
void exp(FixedPointTensor *ret, size_t iter = 8) const;
// exp approximate: exp(x) = \lim_{n->inf} (1+x/n)^n
// where n = 2^iter
void exp(FixedPointTensor* ret, size_t iter = 8) const;
// element-wise relu
void relu(FixedPointTensor *ret) const;
void relu(FixedPointTensor* ret) const;
// element-wise sigmoid
void sigmoid(FixedPointTensor *ret) const;
// element-wise relu with relu'
void relu_with_derivative(FixedPointTensor* ret, BooleanTensor<T>* derivative) const;
// element-wise sigmoid using a 3-piece piecewise polynomial
void sigmoid(FixedPointTensor* ret) const;
// element-wise sigmoid using a 5-piece piecewise polynomial
// see paper [Privacy-preserving collaborative machine learning
// on genomic data using TensorFlow]
void sigmoid_enhanced(FixedPointTensor* ret) const;
// element-wise sigmoid using Chebyshev polynomial approximation
// implemented with ref to tfe[https://github.com/tf-encrypted/tf-encrypted]
void sigmoid_chebyshev(FixedPointTensor* ret) const;
// softmax axis = -1
void softmax(FixedPointTensor *ret) const;
void softmax(FixedPointTensor* ret,
bool use_relu = false,
bool use_long_div = true) const;
// element-wise polynomial
void polynomial(const TensorAdapter<T> *coeff, FixedPointTensor *ret) const;
void polynomial(const TensorAdapter<T>* coeff,
FixedPointTensor* ret) const;
// element-wise piecewise polynomial
void polynomial_piecewise(const TensorAdapter<T> *coeff,
const TensorAdapter<T> *break_point,
FixedPointTensor *ret) const;
void polynomial_piecewise(
const TensorAdapter<T>* coeff,
const TensorAdapter<T>* break_point,
FixedPointTensor* ret) const;
// element-wise compare
// <
template <template <typename U, size_t...> class CTensor, size_t... N1>
void lt(const CTensor<T, N1...> *rhs, BooleanTensor<T> *ret) const;
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void lt(const CTensor<T, N1...>* rhs, BooleanTensor<T>* ret) const;
// <=
template <template <typename U, size_t...> class CTensor, size_t... N1>
void leq(const CTensor<T, N1...> *rhs, BooleanTensor<T> *ret) const;
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void leq(const CTensor<T, N1...>* rhs, BooleanTensor<T>* ret) const;
// >
template <template <typename U, size_t...> class CTensor, size_t... N1>
void gt(const CTensor<T, N1...> *rhs, BooleanTensor<T> *ret) const;
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void gt(const CTensor<T, N1...>* rhs, BooleanTensor<T>* ret) const;
// >=
template <template <typename U, size_t...> class CTensor, size_t... N1>
void geq(const CTensor<T, N1...> *rhs, BooleanTensor<T> *ret) const;
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void geq(const CTensor<T, N1...>* rhs, BooleanTensor<T>* ret) const;
// ==
template <template <typename U, size_t...> class CTensor, size_t... N1>
void eq(const CTensor<T, N1...> *rhs, BooleanTensor<T> *ret) const;
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void eq(const CTensor<T, N1...>* rhs, BooleanTensor<T>* ret) const;
// !=
template <template <typename U, size_t...> class CTensor, size_t... N1>
void neq(const CTensor<T, N1...> *rhs, BooleanTensor<T> *ret) const;
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void neq(const CTensor<T, N1...>* rhs, BooleanTensor<T>* ret) const;
// element-wise max
// if not null, cmp stores true if rhs is bigger
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void max(const CTensor<T, N1...>* rhs,
FixedPointTensor* ret,
BooleanTensor<T>* cmp = nullptr) const;
// for tensor with shape like [k, n, m, ...]
// ret shape is [1, n, m, ...], in which every element is the largest of the k elements
// pos shape is [k, n, m, ...], each column of pos is a one-hot tensor
// indicating the max element's position
void max_pooling(FixedPointTensor* ret,
BooleanTensor<T>* pos = nullptr) const;
private:
static inline std::shared_ptr<AbstractContext> aby3_ctx() {
......@@ -147,21 +202,38 @@ private:
return paddle::mpc::ContextHolder::tensor_factory();
}
static void truncate1(FixedPointTensor *op, FixedPointTensor *ret,
static void truncate(const FixedPointTensor* op, FixedPointTensor* ret,
size_t scaling_factor);
template<typename MulFunc>
static void mul_trunc(const FixedPointTensor<T, N>* lhs,
const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret,
MulFunc mul_func);
// the truncate3 protocol avoids the msb-loss error during truncation
// at the cost of an acceptable security compromise
static void truncate3(const FixedPointTensor* op, FixedPointTensor* ret,
size_t scaling_factor);
// reduce last dim
static void reduce(FixedPointTensor<T, N> *input,
FixedPointTensor<T, N> *ret);
static void reduce(FixedPointTensor<T, N>* input,
FixedPointTensor<T, N>* ret);
static size_t party() { return aby3_ctx()->party(); }
static size_t party() {
return aby3_ctx()->party();
}
static size_t pre_party() { return aby3_ctx()->pre_party(); }
static size_t pre_party() {
return aby3_ctx()->pre_party();
}
static size_t next_party() { return aby3_ctx()->next_party(); }
static size_t next_party() {
return aby3_ctx()->next_party();
}
static void reshare(const TensorAdapter<T> *send_val,
TensorAdapter<T> *recv_val) {
static void reshare(const TensorAdapter<T>* send_val,
TensorAdapter<T>* recv_val) {
if (party() == 0) {
aby3_ctx()->network()->template recv(next_party(), *recv_val);
aby3_ctx()->network()->template send(pre_party(), *send_val);
......@@ -171,9 +243,17 @@ private:
}
}
TensorAdapter<T> *_share[2];
static void reciprocal(const FixedPointTensor* op, FixedPointTensor* ret,
size_t iter, double x0);
static void inverse_square_root(const FixedPointTensor* op,
FixedPointTensor* ret,
size_t iter, double x0);
TensorAdapter<T>* _share[2];
};
} // namespace aby3
} //namespace aby3
#include "fixedpoint_tensor_imp.h"
......@@ -14,45 +14,45 @@
#pragma once
#include <algorithm>
#include <memory>
#include <algorithm>
#include "paddle/fluid/platform/enforce.h"
#include "prng.h"
namespace aby3 {
template <typename T, size_t N>
FixedPointTensor<T, N>::FixedPointTensor(TensorAdapter<T> *share_tensor[2]) {
template<typename T, size_t N>
FixedPointTensor<T, N>::FixedPointTensor(TensorAdapter<T>* share_tensor[2]) {
// TODO: check tensors' shapes
_share[0] = share_tensor[0];
_share[1] = share_tensor[1];
}
template <typename T, size_t N>
FixedPointTensor<T, N>::FixedPointTensor(TensorAdapter<T> *share_tensor_0,
TensorAdapter<T> *share_tensor_1) {
template<typename T, size_t N>
FixedPointTensor<T, N>::FixedPointTensor(TensorAdapter<T>* share_tensor_0,
TensorAdapter<T>* share_tensor_1) {
// TODO: check tensors' shapes
_share[0] = share_tensor_0;
_share[1] = share_tensor_1;
}
template <typename T, size_t N>
TensorAdapter<T> *FixedPointTensor<T, N>::mutable_share(size_t idx) {
template<typename T, size_t N>
TensorAdapter<T>* FixedPointTensor<T, N>::mutable_share(size_t idx) {
PADDLE_ENFORCE_LT(idx, 2, "Input should be less than 2.");
return _share[idx];
}
template <typename T, size_t N>
const TensorAdapter<T> *FixedPointTensor<T, N>::share(size_t idx) const {
template<typename T, size_t N>
const TensorAdapter<T>* FixedPointTensor<T, N>::share(size_t idx) const {
PADDLE_ENFORCE_LT(idx, 2, "Input should be less than 2.");
return _share[idx];
}
// reveal fixedpointtensor to one party
template <typename T, size_t N>
template<typename T, size_t N>
void FixedPointTensor<T, N>::reveal_to_one(size_t party,
TensorAdapter<T> *ret) const {
TensorAdapter<T>* ret) const {
if (party == this->party()) {
// TODO: check if tensor shape equal
......@@ -71,28 +71,28 @@ void FixedPointTensor<T, N>::reveal_to_one(size_t party,
}
// reveal fixedpointtensor to all parties
template <typename T, size_t N>
void FixedPointTensor<T, N>::reveal(TensorAdapter<T> *ret) const {
template<typename T, size_t N>
void FixedPointTensor<T, N>::reveal(TensorAdapter<T>* ret) const {
for (size_t i = 0; i < 3; ++i) {
reveal_to_one(i, ret);
}
}
template <typename T, size_t N>
template<typename T, size_t N>
const std::vector<size_t> FixedPointTensor<T, N>::shape() const {
return _share[0]->shape();
}
// convert TensorAdapter to shares
template <typename T, size_t N>
void FixedPointTensor<T, N>::share(const TensorAdapter<T> *input,
TensorAdapter<T> *output_shares[3],
//convert TensorAdapter to shares
template<typename T, size_t N>
void FixedPointTensor<T, N>::share(const TensorAdapter<T>* input,
TensorAdapter<T>* output_shares[3],
block seed) {
if (equals(seed, g_zero_block)) {
seed = block_from_dev_urandom();
}
// set seed of prng[2]
//set seed of prng[2]
aby3_ctx()->set_random_seed(seed, 2);
aby3_ctx()->template gen_random_private(*output_shares[0]);
......@@ -106,17 +106,18 @@ void FixedPointTensor<T, N>::share(const TensorAdapter<T> *input,
}
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::add(const FixedPointTensor<T, N> *rhs,
FixedPointTensor<T, N> *ret) const {
template<typename T, size_t N>
void FixedPointTensor<T, N>::add(const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret) const {
_share[0]->add(rhs->_share[0], ret->_share[0]);
_share[1]->add(rhs->_share[1], ret->_share[1]);
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::add(const TensorAdapter<T> *rhs,
FixedPointTensor<T, N> *ret) const {
PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(), "no match scaling factor");
template<typename T, size_t N>
void FixedPointTensor<T, N>::add(const TensorAdapter<T>* rhs,
FixedPointTensor<T, N>* ret) const {
PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(),
"no match scaling factor");
if (party() == 0) {
_share[0]->add(rhs, ret->_share[0]);
_share[1]->copy(ret->_share[1]);
......@@ -129,17 +130,18 @@ void FixedPointTensor<T, N>::add(const TensorAdapter<T> *rhs,
}
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::sub(const FixedPointTensor<T, N> *rhs,
FixedPointTensor<T, N> *ret) const {
template<typename T, size_t N>
void FixedPointTensor<T, N>::sub(const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret) const {
_share[0]->sub(rhs->_share[0], ret->_share[0]);
_share[1]->sub(rhs->_share[1], ret->_share[1]);
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::sub(const TensorAdapter<T> *rhs,
FixedPointTensor<T, N> *ret) const {
PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(), "no match scaling factor");
template<typename T, size_t N>
void FixedPointTensor<T, N>::sub(const TensorAdapter<T>* rhs,
FixedPointTensor<T, N>* ret) const {
PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(),
"no match scaling factor");
if (party() == 0) {
_share[0]->sub(rhs, ret->_share[0]);
_share[1]->copy(ret->_share[1]);
......@@ -152,51 +154,26 @@ void FixedPointTensor<T, N>::sub(const TensorAdapter<T> *rhs,
}
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::negative(FixedPointTensor<T, N> *ret) const {
template<typename T, size_t N>
void FixedPointTensor<T, N>::negative(FixedPointTensor<T, N>* ret) const {
_share[0]->negative(ret->_share[0]);
_share[1]->negative(ret->_share[1]);
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::mul(const FixedPointTensor<T, N> *rhs,
FixedPointTensor<T, N> *ret) const {
auto r_zero = tensor_factory()->template create<T>(this->shape());
aby3_ctx()->gen_zero_sharing_arithmetic(*r_zero.get());
// temp = _share[0] * rhs->_share[0] +
// _share[0] * rhs->_share[1] +
// _share[1] * rhs->_share[0] +
// r_zero
auto temp = tensor_factory()->template create<T>(this->shape());
auto temp1 = tensor_factory()->template create<T>(this->shape());
_share[0]->mul(rhs->_share[0], temp.get());
_share[0]->mul(rhs->_share[1], temp1.get());
temp1->add(temp.get(), temp1.get());
_share[1]->mul(rhs->_share[0], temp.get());
temp1->add(r_zero.get(), temp1.get());
temp->add(temp1.get(), temp.get());
auto temp2 = tensor_factory()->template create<T>(this->shape());
auto temp3 = tensor_factory()->template create<T>(this->shape());
TensorAdapter<int64_t> *temp_array[2] = {temp2.get(), temp3.get()};
std::shared_ptr<FixedPointTensor<T, N>> ret_no_trunc =
std::make_shared<FixedPointTensor<T, N>>(temp_array);
temp->copy(ret_no_trunc->_share[0]);
reshare(temp.get(), ret_no_trunc->_share[1]);
truncate1(ret_no_trunc.get(), ret, N);
template<typename T, size_t N>
void FixedPointTensor<T, N>::mul(const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret) const {
mul_trunc(this, rhs, ret, &TensorAdapter<T>::mul);
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::truncate1(FixedPointTensor<T, N> *op,
FixedPointTensor<T, N> *ret,
template<typename T, size_t N>
void FixedPointTensor<T, N>::truncate(const FixedPointTensor<T, N>* op,
FixedPointTensor<T, N>* ret,
size_t scaling_factor) {
if (scaling_factor == 0) {
op->share(0)->copy(ret->mutable_share(0));
op->share(1)->copy(ret->mutable_share(1));
return;
}
// implement ABY3's truncate1 algorithm
if (party() == 0) {
// party0
......@@ -209,7 +186,10 @@ void FixedPointTensor<T, N>::truncate1(FixedPointTensor<T, N> *op,
aby3_ctx()->template gen_random(*r_12.get(), true);
op->_share[0]->add(op->_share[1], ret->_share[0]);
// trunc from [SecureML, Thm.1]
ret->_share[0]->negative(ret->_share[0]);
ret->_share[0]->rshift(scaling_factor, ret->_share[0]);
ret->_share[0]->negative(ret->_share[0]);
ret->_share[0]->sub(r_12.get(), ret->_share[0]);
aby3_ctx()->network()->template send(0, *(ret->_share[0]));
......@@ -224,68 +204,153 @@ void FixedPointTensor<T, N>::truncate1(FixedPointTensor<T, N> *op,
r_21->copy(ret->_share[0]);
}
return;
}
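// Worked fixed-point example for the right shift above: with scaling factor
// N = 16, a real value v is encoded as v * 2^16, so a product carries 2^32:
// 1.5 -> 98304, 2.0 -> 131072
// 98304 * 131072 = 12884901888, and 12884901888 >> 16 = 196608 = 3.0 * 2^16.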
template <typename T, size_t N>
void FixedPointTensor<T, N>::mul2(const FixedPointTensor<T, N> *rhs,
FixedPointTensor<T, N> *ret) const {
// element-wise mul implemented by ABY3's truncate2 algorithm
// Protocol. `truncate3`
// P2 randomly generates r' \in (-2^62, 2^62) and r'_0, r_0, r_1 in Z_{2^64},
// P2 computes r'_1 = r' - r'_0 and r_2 = r'/2^N - r_0 - r_1, and lets x2 = r_2
// P2 sends r_0, r'_0 to P0 and r_1, r'_1 to P1
// P1 and P0 execute "reveal x - r' to P1"
// P1 computes x1 = (x - r') / 2^N + r_1
// P0 sets x0 = r_0
// P0, P1, P2 invoke reshare() with inputs x0, x1, x2 respectively.
template<typename T, size_t N>
void FixedPointTensor<T, N>::truncate3(const FixedPointTensor<T, N>* op,
FixedPointTensor<T, N>* ret,
size_t scaling_factor) {
if (scaling_factor == 0) {
op->share(0)->copy(ret->mutable_share(0));
op->share(1)->copy(ret->mutable_share(1));
return;
}
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
for (int i = 0; i < 12; ++i) {
temp.emplace_back(tensor_factory()->template create<T>(this->shape()));
}
// gen boolean random share
aby3_ctx()->template gen_random(*temp[0], 0);
aby3_ctx()->template gen_random(*temp[1], 1);
std::shared_ptr<BooleanTensor<T>> r =
std::make_shared<BooleanTensor<T>>(temp[0].get(), temp[1].get());
std::shared_ptr<BooleanTensor<T>> r_integer =
std::make_shared<BooleanTensor<T>>(temp[4].get(), temp[5].get());
r->rshift(N, r_integer.get());
std::shared_ptr<FixedPointTensor<T, N>> r_fixed =
std::make_shared<FixedPointTensor<T, N>>(temp[6].get(), temp[7].get());
std::shared_ptr<FixedPointTensor<T, N>> r_integer_fixed =
std::make_shared<FixedPointTensor<T, N>>(temp[8].get(), temp[9].get());
r->b2a(r_fixed.get());
// r'
r_integer->b2a(r_integer_fixed.get());
// r_zero = gen_zero_share(_shape[0]->shape)
auto r_zero = tensor_factory()->template create<T>(this->shape());
aby3_ctx()->template gen_zero_sharing_arithmetic(*r_zero);
// temp[10] = _share[0] * rhs->_share[0] +
// _share[0] * rhs->_share[1] +
// _share[1] * rhs->_share[0] +
// r_zero - r[0]
_share[0]->mul(rhs->_share[0], temp[11].get());
_share[0]->mul(rhs->_share[1], temp[10].get());
temp[11]->add(temp[10].get(), temp[11].get());
_share[1]->mul(rhs->_share[0], temp[10].get());
temp[11]->add(temp[10].get(), temp[11].get());
r_zero->sub(r_fixed->_share[0], temp[10].get());
temp[10]->add(temp[11].get(), temp[10].get());
// ret = reshare
temp[10]->copy(ret->_share[0]);
reshare(temp[10].get(), ret->_share[1]);
// ret = reconstruct(ret).rshift(N)
// ret = ret + r'
ret->reveal(temp[10].get());
temp[10]->rshift(N, temp[10].get());
r_integer_fixed->add(temp[10].get(), ret);
if (party() == 2) {
for (int i = 0; i < 7; ++i) {
temp.emplace_back(
tensor_factory()->template create<T>(op->shape()));
}
// r', constrained to (-2^62, 2^62)
// notice: when r' is constrained to (-2^62, 2^62),
// the SD (statistical distance) of x - r' between this case
// and r' taken over Z_{2^64} is equal to |X| / (2^63 + |X|)
// according to http://yuyu.hk/files/ho2.pdf
aby3_ctx()->template gen_random_private(*temp[0]);
int64_t contraint_upper = ~((uint64_t) 1 << 62);
int64_t contraint_low = (uint64_t) 1 << 62;
std::for_each(temp[0]->data(), temp[0]->data() + temp[0]->numel(),
[&contraint_upper, &contraint_low] (T& a) {
// constrain to -2^62 < a < 2^62
if (a >= 0) {
a &= contraint_upper;
} else {
a |= contraint_low;
}
});
//r'_0, r'_1
aby3_ctx()->template gen_random_private(*temp[1]);
temp[0]->sub(temp[1].get(), temp[2].get());
// r, r_0, r_1
temp[0]->rshift(scaling_factor, temp[3].get());
aby3_ctx()->template gen_random_private(*temp[4]);
aby3_ctx()->template gen_random_private(*temp[5]);
// r_2
temp[3]->sub(temp[4].get(), temp[6].get());
temp[6]->sub(temp[5].get(), temp[6].get());
aby3_ctx()->network()->template send(1, *temp[2]);
aby3_ctx()->network()->template send(1, *temp[5]);
aby3_ctx()->network()->template send(0, *temp[1]);
aby3_ctx()->network()->template send(0, *temp[4]);
temp[6]->copy(ret->mutable_share(0));
} else if (party() == 1) {
for (int i = 0; i < 4; ++i) {
temp.emplace_back(
tensor_factory()->template create<T>(op->shape()));
}
// r'_1, r_1
aby3_ctx()->network()->template recv(2, *temp[0]);
aby3_ctx()->network()->template recv(2, *temp[1]);
// recv x0 - r'_0 from party 0
aby3_ctx()->network()->template recv(0, *temp[2]);
//reveal x - r' to party 1
op->share(0)->add(op->share(1), temp[3].get());
temp[3]->add(temp[2].get(), temp[3].get());
temp[3]->sub(temp[0].get(), temp[3].get());
// truncate x-r'
temp[3]->rshift(scaling_factor, temp[3].get());
temp[3]->add(temp[1].get(), ret->mutable_share(0));
} else {
for (int i = 0; i < 3; ++i) {
temp.emplace_back(
tensor_factory()->template create<T>(op->shape()));
}
// r'_0, r_0
aby3_ctx()->network()->template recv(2, *temp[0]);
aby3_ctx()->network()->template recv(2, *temp[1]);
//send x0 - r'_0 to party 1
op->share(0)->sub(temp[0].get(), temp[2].get());
aby3_ctx()->network()->template send(1, *temp[2]);
temp[1]->copy(ret->mutable_share(0));
}
reshare(ret->share(0), ret->mutable_share(1));
// compensation for carry in
auto tensor_carry_in = tensor_factory()->template create<T>(ret->shape());
assign_to_tensor(tensor_carry_in.get(), (T)1);
tensor_carry_in->scaling_factor() = N;
ret->add(tensor_carry_in.get(), ret);
}
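// Plaintext walk-through of the protocol comment above (randomness written
// out, wrap-around ignored): with x = x0 + x1 + x2 to be truncated,
// P2's share: r'/2^N - r_0 - r_1
// P1's share: (x - r')/2^N + r_1
// P0's share: r_0
// sum: (x - r')/2^N + r'/2^N ~= x / 2^N,
// so the randomness cancels and only the truncated value remains, up to the
// rounding handled by the carry-in compensation added at the end.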
template <typename T, size_t N>
void FixedPointTensor<T, N>::mul(const TensorAdapter<T> *rhs,
FixedPointTensor<T, N> *ret) const {
template<typename T, size_t N>
template<typename MulFunc>
void FixedPointTensor<T, N>::mul_trunc(const FixedPointTensor<T, N>* lhs,
const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret,
MulFunc mul_func) {
auto r_zero = tensor_factory()->template create<T>(ret->shape());
aby3_ctx()->gen_zero_sharing_arithmetic(*r_zero.get());
// temp = _share[0]->mul(rhs->_share[0]) +
// _share[0]->mul(rhs->_share[1]) +
// _share[1]->mul(rhs->_share[0]) +
// r_zero
auto temp = tensor_factory()->template create<T>(ret->shape());
auto temp1 = tensor_factory()->template create<T>(ret->shape());
// use mul_func to fit both element_wise mul and mat mul
(lhs->share(0)->*mul_func)(rhs->share(0), temp.get());
(lhs->share(0)->*mul_func)(rhs->share(1), temp1.get());
temp1->add(temp.get(), temp1.get());
(lhs->share(1)->*mul_func)(rhs->share(0), temp.get());
temp1->add(r_zero.get(), temp1.get());
temp->add(temp1.get(), temp.get());
auto temp2 = tensor_factory()->template create<T>(ret->shape());
auto temp3 = tensor_factory()->template create<T>(ret->shape());
TensorAdapter<int64_t>* temp_array[2] = {temp2.get(), temp3.get()};
std::shared_ptr<FixedPointTensor<T, N>> ret_no_trunc =
std::make_shared<FixedPointTensor<T, N>>(temp_array);
temp->copy(ret_no_trunc->_share[0]);
reshare(temp.get(), ret_no_trunc->_share[1]);
truncate3(ret_no_trunc.get(), ret, N);
}
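// `mul_func` is a pointer to a TensorAdapter<T> member function, hence the
// ->* call syntax above; the two existing call sites are
// mul_trunc(this, rhs, ret, &TensorAdapter<T>::mul); // element-wise mul
// mul_trunc(this, rhs, ret, &TensorAdapter<T>::mat_mul); // mat_mul
// which lets one body serve both products before the shared truncate3 step.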
template<typename T, size_t N>
void FixedPointTensor<T, N>::mul(const TensorAdapter<T>* rhs,
FixedPointTensor<T, N>* ret) const {
// PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(),
// "no match scaling factor");
auto temp0 = tensor_factory()->template create<T>(this->shape());
......@@ -295,16 +360,16 @@ void FixedPointTensor<T, N>::mul(const TensorAdapter<T> *rhs,
_share[0]->mul(rhs, temp->_share[0]);
_share[1]->mul(rhs, temp->_share[1]);
truncate1(temp.get(), ret, rhs->scaling_factor());
truncate3(temp.get(), ret, rhs->scaling_factor());
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::sum(FixedPointTensor<T, N> *ret) const {
template<typename T, size_t N>
void FixedPointTensor<T, N>::sum(FixedPointTensor<T, N>* ret) const {
PADDLE_ENFORCE_EQ(ret->numel(), 1, "output size should be 1.");
T sum1 = (T)0;
T sum2 = (T)0;
T *iter_0 = _share[0]->data();
T *iter_1 = _share[1]->data();
T sum1 = (T) 0;
T sum2 = (T) 0;
T* iter_0 = _share[0]->data();
T* iter_1 = _share[1]->data();
for (int i = 0; i < this->numel(); ++i) {
sum1 += *(iter_0 + i);
sum2 += *(iter_1 + i);
......@@ -313,10 +378,11 @@ void FixedPointTensor<T, N>::sum(FixedPointTensor<T, N> *ret) const {
assign_to_tensor(ret->_share[1], sum2);
}
template <typename T, size_t N>
template <template <typename U, size_t...> class CTensor, size_t... N1>
void FixedPointTensor<T, N>::dot_mul(const CTensor<T, N1...> *rhs,
FixedPointTensor<T, N> *ret) const {
template<typename T, size_t N>
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void FixedPointTensor<T, N>::dot_mul(const CTensor<T, N1...>* rhs,
FixedPointTensor<T, N>* ret) const {
PADDLE_ENFORCE_EQ(ret->numel(), 1, "output size should be 1.");
auto temp0 = tensor_factory()->template create<T>(this->shape());
......@@ -327,76 +393,61 @@ void FixedPointTensor<T, N>::dot_mul(const CTensor<T, N1...> *rhs,
temp->sum(ret);
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::mat_mul(const FixedPointTensor<T, N> *rhs,
FixedPointTensor<T, N> *ret) const {
auto r_zero = tensor_factory()->template create<T>(ret->shape());
aby3_ctx()->gen_zero_sharing_arithmetic(*r_zero.get());
// temp = _share[0]->mat_mul(rhs->_share[0]) +
// _share[0]->mat_mul(rhs->_share[1]) +
// _share[1]->mat_mul(rhs->_share[0]) +
// r_zero
auto temp = tensor_factory()->template create<T>(ret->shape());
auto temp1 = tensor_factory()->template create<T>(ret->shape());
_share[0]->mat_mul(rhs->_share[0], temp.get());
_share[0]->mat_mul(rhs->_share[1], temp1.get());
temp1->add(temp.get(), temp1.get());
_share[1]->mat_mul(rhs->_share[0], temp.get());
temp1->add(r_zero.get(), temp1.get());
temp->add(temp1.get(), temp.get());
auto temp2 = tensor_factory()->template create<T>(ret->shape());
auto temp3 = tensor_factory()->template create<T>(ret->shape());
TensorAdapter<int64_t> *temp_array[2] = {temp2.get(), temp3.get()};
std::shared_ptr<FixedPointTensor<T, N>> ret_no_trunc =
std::make_shared<FixedPointTensor<T, N>>(temp_array);
temp->copy(ret_no_trunc->_share[0]);
reshare(temp.get(), ret_no_trunc->_share[1]);
truncate1(ret_no_trunc.get(), ret, N);
template<typename T, size_t N>
void FixedPointTensor<T, N>::mat_mul(const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret) const {
mul_trunc(this, rhs, ret, &TensorAdapter<T>::mat_mul);
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::mat_mul(const TensorAdapter<T> *rhs,
FixedPointTensor<T, N> *ret) const {
template<typename T, size_t N>
void FixedPointTensor<T, N>::mat_mul(const TensorAdapter<T>* rhs,
FixedPointTensor<T, N>* ret) const {
_share[0]->mat_mul(rhs, ret->_share[0]);
_share[1]->mat_mul(rhs, ret->_share[1]);
truncate1(ret, ret, rhs->scaling_factor());
truncate3(ret, ret, rhs->scaling_factor());
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::div(const TensorAdapter<T> *rhs,
FixedPointTensor<T, N> *ret) const {
PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(), "no match scaling factor");
template< typename T, size_t N>
void FixedPointTensor<T, N>::div(const TensorAdapter<T>* rhs,
FixedPointTensor<T, N>* ret) const {
PADDLE_ENFORCE_EQ(N, rhs->scaling_factor(),
"no match scaling factor");
auto temp = tensor_factory()->template create<T>(this->shape());
double scale = std::pow(2, rhs->scaling_factor());
auto inverse = [scale](T d) -> T { return 1.0 * scale / d * scale; };
std::transform(rhs->data(), rhs->data() + rhs->numel(), temp->data(),
inverse);
auto inverse = [scale](T d) -> T {
return 1.0 * scale / d * scale; };
std::transform(rhs->data(), rhs->data() + rhs->numel(),
temp->data(), inverse);
temp->scaling_factor() = rhs->scaling_factor();
this->mul(temp.get(), ret);
}
template <typename T, size_t N>
void FixedPointTensor<T, N>::exp(FixedPointTensor<T, N> *ret,
template<typename T, size_t N>
void FixedPointTensor<T, N>::div(const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret,
size_t iter, double x0) const {
auto temp0 = tensor_factory()->template create<T>(ret->shape());
auto temp1 = tensor_factory()->template create<T>(ret->shape());
std::shared_ptr<FixedPointTensor<T, N>> temp =
std::make_shared<FixedPointTensor<T, N>>(temp0.get(), temp1.get());
reciprocal(rhs, temp.get(), iter, x0);
this->mul(temp.get(), ret);
}
template<typename T, size_t N>
void FixedPointTensor<T, N>::exp(FixedPointTensor<T, N>* ret,
size_t iter) const {
// exp approximate: exp(x) = \lim_{n->inf} (1+x/n)^n
// where n = 2^iter
auto pow_iter = tensor_factory()->template create<T>(this->shape());
assign_to_tensor(pow_iter.get(), (T)(pow(2, N - iter)));
assign_to_tensor(pow_iter.get(), (T) (pow(2, N -iter)));
pow_iter->scaling_factor() = N;
auto tensor_one = tensor_factory()->template create<T>(this->shape());
assign_to_tensor(tensor_one.get(), (T)1 << N);
assign_to_tensor(tensor_one.get(), (T) 1 << N);
tensor_one->scaling_factor() = N;
this->mul(pow_iter.get(), ret);
......@@ -408,79 +459,130 @@ void FixedPointTensor<T, N>::exp(FixedPointTensor<T, N> *ret,
}
}
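// Worked numbers for the approximation comment above: with the default
// iter = 8, n = 2^8 = 256, so exp(1) ~= (1 + 1/256)^256 ~= 2.713 versus
// e ~= 2.71828; each extra iteration halves x/n and adds one more squaring.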
template <typename T, size_t N>
void FixedPointTensor<T, N>::relu(FixedPointTensor<T, N> *ret) const {
// utilize polynomial_piecewise
template< typename T, size_t N>
void FixedPointTensor<T, N>::relu(FixedPointTensor<T, N>* ret) const {
//utilize polynomial_piecewise
// break_point = {0}, coeff[0] = {0, 0}, coeff[1] = {0, 1}
// break_point.shape = {1, this->shape}, coeff.shape = {2, 2, this->shape}
auto shape_ = shape();
// construct break_point
//construct break_point
auto b_shape = shape_;
b_shape.insert(b_shape.begin(), 1);
auto break_point = tensor_factory()->template create<T>(b_shape);
T *b_ptr = break_point->data();
T* b_ptr = break_point->data();
for (size_t i = 0; i < break_point->numel(); ++i) {
b_ptr[i] = 0;
}
break_point->scaling_factor() = N;
// construct coeff
std::vector<size_t> c_shape = {2, 2};
c_shape.insert(c_shape.end(), shape_.begin(), shape_.end());
auto coeff = tensor_factory()->template create<T>(c_shape);
T *c_ptr = coeff->data();
T* c_ptr = coeff->data();
for (size_t i = 0; i < 3 * this->numel(); ++i) {
c_ptr[i] = 0;
}
for (size_t i = 3 * this->numel(); i < 4 * this->numel(); ++i) {
c_ptr[i] = (T)1 << N;
c_ptr[i] = (T) 1 << N;
}
coeff->scaling_factor() = N;
this->polynomial_piecewise(coeff.get(), break_point.get(), ret);
}
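// With break_point = {0}, coeff[0] = {0, 0} and coeff[1] = {0, 1} (the 1
// stored as (T)1 << N in fixed point), polynomial_piecewise evaluates
// relu(x) = 0 + 0 * x for x < 0
// relu(x) = 0 + 1 * x for x >= 0,
// i.e. exactly max(x, 0).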
template <typename T, size_t N>
void FixedPointTensor<T, N>::sigmoid(FixedPointTensor<T, N> *ret) const {
// utilize polynomial_piecewise
template< typename T, size_t N>
void FixedPointTensor<T, N>::relu_with_derivative(
FixedPointTensor<T, N>* ret, BooleanTensor<T>* derivative) const {
auto shape_ = shape();
auto zero = tensor_factory()->template create<T>(shape_);
assign_to_tensor(zero.get(), (T)0);
zero->scaling_factor() = N;
auto tmp0 = tensor_factory()->template create<T>(shape_);
auto tmp1 = tensor_factory()->template create<T>(shape_);
BooleanTensor<T> der(tmp0.get(), tmp1.get());
gt(zero.get(), &der);
der.mul(this, ret);
if (derivative) {
der.share(0)->copy(derivative->share(0));
der.share(1)->copy(derivative->share(1));
}
}
template< typename T, size_t N>
void FixedPointTensor<T, N>::sigmoid_chebyshev(FixedPointTensor<T, N>* ret) const {
//utilize Chebyshev polynomial approximation
// more accurate in small range, such as [-4, 4]
auto shape = ret->shape();
std::vector<size_t> shape_ = shape;
shape_.insert(shape_.begin(), 10);
auto numel = ret->numel();
auto coeff = tensor_factory()->template create<T>(shape_);
std::vector<double> w;
w.resize(10, 0.0f);
w[0] = 0.5;
w[1] = 0.2159198015;
w[3] = -0.0082176259;
w[5] = 0.0001825597;
w[7] = -0.0000018848;
w[9] = 0.0000000072;
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < numel; ++j) {
*(coeff->data() + i * numel + j) = (T) (w[i] * pow(2, N));
}
}
coeff->scaling_factor() = N;
polynomial(coeff.get(), ret);
}
template< typename T, size_t N>
void FixedPointTensor<T, N>::sigmoid(FixedPointTensor<T, N>* ret) const {
//utilize polynomial_piecewise
// break_point = {-2.5, 2.5}
// coeff[0] = {10^-4, 0}, coeff[1] = {0.5, 0.17}
// coeff[2] = {1 - 10^-4, 0}
// break_point.shape = {2, this->shape}, coeff.shape = {3, 2, this->shape}
// construct break_point
//construct break_point
auto shape_ = shape();
// construct break_point
//construct break_point
auto b_shape = shape_;
b_shape.insert(b_shape.begin(), 2);
auto break_point = tensor_factory()->template create<T>(b_shape);
T *b_ptr = break_point->data();
T* b_ptr = break_point->data();
for (size_t i = 0; i < break_point->numel(); ++i) {
b_ptr[i] = 0;
}
for (size_t i = 0; i < break_point->numel() / 2; ++i) {
b_ptr[i] = (T)(-2.5 * pow(2, N));
b_ptr[i] = (T) (-2.5 * pow(2, N));
}
for (size_t i = break_point->numel() / 2; i < break_point->numel(); ++i) {
b_ptr[i] = (T)(2.5 * pow(2, N));
b_ptr[i] = (T) (2.5 * pow(2, N));
}
break_point->scaling_factor() = N;
// construct coeff
std::vector<size_t> c_shape = {3, 2};
c_shape.insert(c_shape.end(), shape_.begin(), shape_.end());
auto coeff = tensor_factory()->template create<T>(c_shape);
T *c_ptr = coeff->data();
T* c_ptr = coeff->data();
size_t numel = this->numel();
double scale = std::pow(2, N);
......@@ -497,98 +599,306 @@ void FixedPointTensor<T, N>::sigmoid(FixedPointTensor<T, N> *ret) const {
this->polynomial_piecewise(coeff.get(), break_point.get(), ret);
}
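// The three pieces above amount to
// sigmoid(x) ~= 0.0001 for x < -2.5
// sigmoid(x) ~= 0.5 + 0.17 * x for -2.5 <= x < 2.5
// sigmoid(x) ~= 0.9999 for x >= 2.5,
// e.g. at x = 1 this gives 0.67 against the exact ~0.731; sigmoid_enhanced
// and sigmoid_chebyshev trade more pieces/terms for better accuracy.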
template <typename T, size_t N>
void FixedPointTensor<T, N>::softmax(FixedPointTensor<T, N> *ret) const {
// relu_x = relu(this)
auto &shape = this->shape();
auto temp0 = tensor_factory()->template create<T>(this->shape());
auto temp1 = tensor_factory()->template create<T>(this->shape());
std::shared_ptr<FixedPointTensor<T, N>> relu_x =
std::make_shared<FixedPointTensor<T, N>>(temp0.get(), temp1.get());
this->relu(relu_x.get());
template< typename T, size_t N>
void FixedPointTensor<T, N>::sigmoid_enhanced(FixedPointTensor<T, N>* ret) const {
//utilize polynomial_piecewise
// break_point = {-5, -2.5, 2.5, 5}
// coeff[0] = {10^-4, 0}, coeff[1] = {0.145, 0.02776}
// coeff[2] = {0.5, 0.17}, coeff[3] = {0.85498, 0.02776}, coeff[4] = {0.9999, 0}
// break_point.shape = {4, this->shape}, coeff.shape = {5, 2, this->shape}
//construct break_point
auto shape_ = shape();
//construct break_point
auto b_shape = shape_;
b_shape.insert(b_shape.begin(), 4);
// get sum: reduce shape : from this->shape() to
// this->shape()[0],...,shape()[n-2]
std::vector<size_t> shape_sum;
for (int i = 0; i < shape.size() - 1; ++i) {
shape_sum.emplace_back(shape[i]);
auto break_point = tensor_factory()->template create<T>(b_shape);
T* b_ptr = break_point->data();
auto numel = ret->numel();
double scale = std::pow(2, N);
for (size_t i = 0; i < numel; ++i) {
b_ptr[i] = (T) (-5 * scale);
b_ptr[i + numel] = (T) (-2.5 * scale);
b_ptr[i + 2 * numel] = (T) (2.5 * scale);
b_ptr[i + 3 * numel] = (T) (5 * scale);
}
break_point->scaling_factor() = N;
auto temp2 = tensor_factory()->template create<T>(shape_sum);
auto temp3 = tensor_factory()->template create<T>(shape_sum);
std::shared_ptr<FixedPointTensor<T, N>> sum =
std::make_shared<FixedPointTensor<T, N>>(temp2.get(), temp3.get());
//construct coeff
std::vector<size_t> c_shape = {5, 2};
c_shape.insert(c_shape.end(), shape_.begin(), shape_.end());
auto coeff = tensor_factory()->template create<T>(c_shape);
T* c_ptr = coeff->data();
for (size_t i = 0; i < numel; ++i) {
c_ptr[i] = 0.0001 * scale;
c_ptr[i + numel] = 0;
c_ptr[i + 2 * numel] = 0.145 * scale;
c_ptr[i + 3 * numel] = 0.02776 * scale;
c_ptr[i + 4 * numel] = 0.5 * scale;
c_ptr[i + 5 * numel] = 0.17 * scale;
c_ptr[i + 6 * numel] = 0.85498 * scale;
c_ptr[i + 7 * numel] = 0.02776 * scale;
c_ptr[i + 8 * numel] = 0.9999 * scale;
c_ptr[i + 9 * numel] = 0 * scale;
}
coeff->scaling_factor() = N;
// reduce relu_x's last dim
reduce(relu_x.get(), sum.get());
this->polynomial_piecewise(coeff.get(), break_point.get(), ret);
}
// reveal (TODO: security improve)
auto sum_plain = tensor_factory()->template create<T>(sum->shape());
sum->reveal(sum_plain.get());
template< typename T, size_t N>
void FixedPointTensor<T, N>::softmax(FixedPointTensor<T, N>* ret,
bool use_relu, bool use_long_div) const {
// softmax axis = -1
const size_t col = *(shape().end() - 1);
const size_t row = numel() / col;
// extend sum_plain shape to relu_x->shape(), padding with sum_value
auto sum_extend = tensor_factory()->template create<T>(relu_x->shape());
sum_extend->scaling_factor() = N;
T *sum_ext_ptr = sum_extend->data();
T *sum_plain_ptr = sum_plain->data();
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
// 11 for allocating temp tensor
for (size_t i = 0; i < 11; ++i) {
temp.emplace_back(
tensor_factory()->template create<T>());
}
size_t ite_size = shape[shape.size() - 1];
for (int j = 0; j < sum_plain->numel(); ++j) {
for (int i = 0; i < ite_size; ++i) {
*(sum_ext_ptr + j * ite_size + i) = *(sum_plain_ptr + j);
temp[0]->reshape({row, col});
temp[1]->reshape({row, col});
FixedPointTensor<T, N> x(temp[0].get(), temp[1].get());
if (!use_relu) {
temp[2]->reshape({col, row});
temp[3]->reshape({col, row});
temp[4]->reshape({1, row});
temp[5]->reshape({1, row});
}
FixedPointTensor<T, N> x_t(temp[2].get(), temp[3].get());
FixedPointTensor<T, N> max_x_t(temp[4].get(), temp[5].get());
temp[6]->reshape({row, 1});
temp[7]->reshape({row, 1});
FixedPointTensor<T, N> max_x(temp[6].get(), temp[7].get());
temp[8]->reshape({row, col});
temp[9]->reshape({row, col});
FixedPointTensor<T, N> max_x_broadcast(temp[8].get(), temp[9].get());
temp[10]->reshape({row, col});
auto exp_lower_bound = temp[10].get();
auto transpose = [](const TensorAdapter<T>* in, TensorAdapter<T>* out) {
// suppose input dims = 2
const size_t col = in->shape()[1];
const size_t row = in->shape()[0];
const size_t numel = in->numel();
for (size_t k = 0; k < numel; ++k) {
size_t i = k / row;
size_t j = k % row;
out->data()[k] = in->data()[j * col + i];
}
};
auto broadcast = [](const TensorAdapter<T>* in, TensorAdapter<T>* out) {
// suppose input dims = 2
// in shape = [row, 1]
const size_t col = out->shape()[1];
const size_t row = out->shape()[0];
for (size_t k = 0; k < out->numel(); ++k) {
size_t i = k / col;
out->data()[k] = in->data()[i];
}
};
share(0)->copy(x.mutable_share(0));
share(1)->copy(x.mutable_share(1));
if (use_relu) {
x.relu(&x);
} else { // use exp
transpose(x.share(0), x_t.mutable_share(0));
transpose(x.share(1), x_t.mutable_share(1));
// x = max(input - max(input), exp_lower_bound)
x_t.max_pooling(&max_x_t);
transpose(max_x_t.share(0), max_x.mutable_share(0));
transpose(max_x_t.share(1), max_x.mutable_share(1));
broadcast(max_x.share(0), max_x_broadcast.mutable_share(0));
broadcast(max_x.share(1), max_x_broadcast.mutable_share(1));
x.sub(&max_x_broadcast, &x);
// n = 64, see exp
assign_to_tensor(exp_lower_bound, (T)(-64 * (1 << N)));
exp_lower_bound->scaling_factor() = N;
x.sub(exp_lower_bound, &x);
x.relu(&x);
x.add(exp_lower_bound, &x);
x.exp(&x);
}
// reuse max_x as sum
reduce(&x, &max_x);
if (!use_long_div) { // invert sum by Newton's method
// divisor range = [1/col, 1.0]
// TODO: find better iter num & init val
reciprocal(&max_x, &max_x, 16, 0.5 / col);
}
relu_x->div(sum_extend.get(), ret);
broadcast(max_x.share(0), max_x_broadcast.mutable_share(0));
broadcast(max_x.share(1), max_x_broadcast.mutable_share(1));
if (use_long_div) {
x.long_div(&max_x_broadcast, &x, 1);
} else {
x.mul(&max_x_broadcast, &x);
}
x.share(0)->copy(ret->mutable_share(0));
x.share(1)->copy(ret->mutable_share(1));
}
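// Note on the exp branch above (a sketch of the reasoning, not additional
// protocol steps): it uses the standard stabilization
// softmax(x) = softmax(x - max(x)),
// so every input to exp() is <= 0, and the relu around exp_lower_bound
// implements max(x - max(x), -64), keeping exp() inside the range the
// fixed-point approximation can represent before the row-wise division.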
template<typename T, size_t N>
void FixedPointTensor<T, N>::long_div(const FixedPointTensor<T, N>* rhs,
FixedPointTensor<T, N>* ret,
size_t int_len) const {
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
for (int i = 0; i < 16; ++i) {
temp.emplace_back(
tensor_factory()->template create<T>(ret->shape()));
}
BooleanTensor<T> sign_lhs(temp[0].get(), temp[1].get());
BooleanTensor<T> sign_rhs(temp[2].get(), temp[3].get());
BooleanTensor<T> sign_ret(temp[4].get(), temp[5].get());
FixedPointTensor<T, N> abs_lhs(temp[6].get(), temp[7].get());
FixedPointTensor<T, N> abs_rhs(temp[8].get(), temp[9].get());
FixedPointTensor<T, N> sub_rhs(temp[10].get(), temp[11].get());
BooleanTensor<T> cmp_res(temp[12].get(), temp[13].get());
BooleanTensor<T> cmp_res_all(temp[14].get(), temp[15].get());
assign_to_tensor(cmp_res_all.share(0), (T)0);
assign_to_tensor(cmp_res_all.share(1), (T)0);
const size_t msb = sizeof(T) * 8 - 1;
sign_lhs.bit_extract(msb, this);
sign_rhs.bit_extract(msb, rhs);
sign_lhs.bitwise_xor(&sign_rhs, &sign_ret);
auto lshift = [] (const FixedPointTensor<T, N>* in,
size_t rhs,
FixedPointTensor<T, N>* out) {
in->share(0)->lshift(rhs, out->mutable_share(0));
in->share(1)->lshift(rhs, out->mutable_share(1));
};
// abs = val - 2 * sign * val
auto abs = [lshift] (const FixedPointTensor<T, N>* in,
const BooleanTensor<T>* sign,
FixedPointTensor<T, N>* out) {
lshift(in, 1, out);
sign->mul(out, out);
in->sub(out, out);
};
auto out0 = tensor_factory()->template create<T>(ret->shape());
abs(this, &sign_lhs, &abs_lhs);
abs(rhs, &sign_rhs, &abs_rhs);
for (ssize_t i = int_len - 1; i >= 0; --i) {
lshift(&abs_rhs, i, &sub_rhs);
abs_lhs.gt(&sub_rhs, &cmp_res);
cmp_res.mul(&sub_rhs, &sub_rhs);
cmp_res.lshift(N + i, &cmp_res);
abs_lhs.sub(&sub_rhs, &abs_lhs);
cmp_res.bitwise_xor(&cmp_res_all, &cmp_res_all);
}
for (size_t i = 1; i <= N; ++i) {
truncate3(&abs_rhs, &sub_rhs, i);
abs_lhs.gt(&sub_rhs, &cmp_res);
cmp_res.mul(&sub_rhs, &sub_rhs);
cmp_res.lshift(N - i, &cmp_res);
abs_lhs.sub(&sub_rhs, &abs_lhs);
cmp_res.bitwise_xor(&cmp_res_all, &cmp_res_all);
}
// use abs_lhs as buffer
cmp_res_all.b2a(&abs_lhs);
abs(&abs_lhs, &sign_ret, ret);
}
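// ---------------------------------------------------------------------------
// Illustration only (not part of this header): long_div is restoring division
// done bit by bit on the absolute values, with the sign re-applied at the end.
// The plain fixed-point sketch below (helper name is an assumption) produces
// the quotient's int_len integer bits and n fractional bits; the shared code
// uses a strict greater-than compare, so results may differ by one ulp.
#include <cstddef>
#include <cstdint>

int64_t fixed_long_div(int64_t lhs, int64_t rhs, size_t n, size_t int_len) {
    bool neg = (lhs < 0) != (rhs < 0);
    uint64_t a = lhs < 0 ? 0ULL - static_cast<uint64_t>(lhs) : static_cast<uint64_t>(lhs);
    uint64_t b = rhs < 0 ? 0ULL - static_cast<uint64_t>(rhs) : static_cast<uint64_t>(rhs);
    uint64_t q = 0;
    // integer quotient bits live at raw positions n + i
    for (int i = static_cast<int>(int_len) - 1; i >= 0; --i) {
        if (a >= (b << i)) { a -= b << i; q |= uint64_t(1) << (n + i); }
    }
    // fractional quotient bits live at raw positions n - i
    for (size_t i = 1; i <= n; ++i) {
        if (a >= (b >> i)) { a -= b >> i; q |= uint64_t(1) << (n - i); }
    }
    int64_t res = static_cast<int64_t>(q);
    return neg ? -res : res;
}
// ---------------------------------------------------------------------------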
// reduce last dim
template <typename T, size_t N>
void FixedPointTensor<T, N>::reduce(FixedPointTensor<T, N> *input,
FixedPointTensor<T, N> *ret) {
// enforce shape: input->shape[0 ... (n-2)] == ret shape
auto &shape = input->shape();
void FixedPointTensor<T, N>::reduce(FixedPointTensor<T, N>* input,
FixedPointTensor<T, N>* ret) {
//enforce shape: input->shape[0 ... (n-2)] == ret shape
auto& shape = input->shape();
size_t ite_size = shape[shape.size() - 1];
T *ret_begin_ptr_0 = ret->_share[0]->data();
T *ret_begin_ptr_1 = ret->_share[1]->data();
T* ret_begin_ptr_0 = ret->_share[0]->data();
T* ret_begin_ptr_1 = ret->_share[1]->data();
T *input_begin_ptr_0 = input->_share[0]->data();
T *input_begin_ptr_1 = input->_share[1]->data();
T* input_begin_ptr_0 = input->_share[0]->data();
T* input_begin_ptr_1 = input->_share[1]->data();
for (int j = 0; j < ret->numel(); ++j) {
*(ret_begin_ptr_0 + j) = *(input_begin_ptr_0 + j * ite_size);
*(ret_begin_ptr_1 + j) = *(input_begin_ptr_1 + j * ite_size);
for (int i = 1; i < ite_size; ++i) {
*(ret_begin_ptr_0 + j) += *(input_begin_ptr_0 + j * ite_size + i);
*(ret_begin_ptr_1 + j) += *(input_begin_ptr_1 + j * ite_size + i);
*(ret_begin_ptr_0 + j) +=
*(input_begin_ptr_0 + j * ite_size + i);
*(ret_begin_ptr_1 + j) +=
*(input_begin_ptr_1 + j * ite_size + i);
}
}
}
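// ---------------------------------------------------------------------------
// Illustration only (not part of this header): reduce sums the last dimension,
// so element j of the output is the sum of input[j * ite_size .. j * ite_size
// + ite_size - 1].  The helper below (name is an assumption) does the same on
// a flat plaintext buffer.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> reduce_last_dim(const std::vector<int64_t>& in,
                                     size_t ite_size) {
    std::vector<int64_t> out(in.size() / ite_size, 0);
    for (size_t j = 0; j < out.size(); ++j) {
        for (size_t i = 0; i < ite_size; ++i) {
            out[j] += in[j * ite_size + i];
        }
    }
    return out;
}
// ---------------------------------------------------------------------------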
template <typename T, size_t N>
void FixedPointTensor<T, N>::polynomial(const TensorAdapter<T> *coeff,
FixedPointTensor<T, N> *ret) const {
template< typename T, size_t N>
void FixedPointTensor<T, N>::polynomial(const TensorAdapter<T>* coeff,
FixedPointTensor<T, N>* ret) const {
// e.g., x.shape = {2, 3}, coeff.shape = {n, 2, 3} (n: polynomial power)
// TODO: improve performance: [ABY3]
//TODO: improve performance: [ABY3]
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
for (int i = 0; i < 5; ++i) {
temp.emplace_back(tensor_factory()->template create<T>(this->shape()));
for (int i = 0; i < 7; ++i) {
temp.emplace_back(
tensor_factory()->template create<T>(this->shape()));
}
std::shared_ptr<FixedPointTensor<T, N>> x_pow_i =
std::make_shared<FixedPointTensor<T, N>>(temp[0].get(), temp[1].get());
std::make_shared<FixedPointTensor<T, N>>(
temp[0].get(), temp[1].get());
std::shared_ptr<FixedPointTensor<T, N>> temp_fixed =
std::make_shared<FixedPointTensor<T, N>>(temp[2].get(), temp[3].get());
assign_to_tensor(ret->_share[0], (T)0);
assign_to_tensor(ret->_share[1], (T)0);
// initialize x_pow_i to fixed-point 1 (x^0)
assign_to_tensor(x_pow_i.get()->_share[0], (T)0);
assign_to_tensor(x_pow_i.get()->_share[1], (T)0);
assign_to_tensor(temp[4].get(), (T)1 << N);
std::make_shared<FixedPointTensor<T, N>>(
temp[2].get(), temp[3].get());
std::shared_ptr<FixedPointTensor<T, N>> result =
std::make_shared<FixedPointTensor<T, N>>(
temp[5].get(), temp[6].get());
assign_to_tensor(result->_share[0], (T) 0);
assign_to_tensor(result->_share[1], (T) 0);
//initialize x_pow_i to fixed-point 1 (x^0)
assign_to_tensor(x_pow_i.get()->_share[0], (T) 0);
assign_to_tensor(x_pow_i.get()->_share[1], (T) 0);
assign_to_tensor(temp[4].get(), (T) 1 << N);
temp[4]->scaling_factor() = N;
x_pow_i->add(temp[4].get(), x_pow_i.get());
......@@ -600,31 +910,42 @@ void FixedPointTensor<T, N>::polynomial(const TensorAdapter<T> *coeff,
t_shape.erase(t_shape.begin());
t->reshape(t_shape);
x_pow_i->mul(t.get(), temp_fixed.get());
ret->add(temp_fixed.get(), ret);
result->add(temp_fixed.get(), result.get());
x_pow_i->mul(this, x_pow_i.get());
}
result->share(0)->copy(ret->mutable_share(0));
result->share(1)->copy(ret->mutable_share(1));
}
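// ---------------------------------------------------------------------------
// Illustration only (not part of this header): polynomial accumulates
// sum_i coeff[i] * x^i, building x^i by repeated fixed-point multiplication
// exactly as the loop above does on shares.  Helper names are assumptions;
// __int128 (a GCC/Clang extension) keeps the product exact before truncation.
#include <cstddef>
#include <cstdint>
#include <vector>

inline int64_t fixed_mul(int64_t a, int64_t b, size_t n) {
    return static_cast<int64_t>((static_cast<__int128>(a) * b) >> n);
}

int64_t fixed_polynomial(const std::vector<int64_t>& coeff, int64_t x, size_t n) {
    int64_t result = 0;
    int64_t x_pow_i = int64_t(1) << n;       // fixed-point 1
    for (int64_t c : coeff) {
        result += fixed_mul(c, x_pow_i, n);  // result += coeff[i] * x^i
        x_pow_i = fixed_mul(x_pow_i, x, n);  // x^(i+1)
    }
    return result;
}
// ---------------------------------------------------------------------------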
template <typename T, size_t N>
template< typename T, size_t N>
void FixedPointTensor<T, N>::polynomial_piecewise(
const TensorAdapter<T> *coeff, const TensorAdapter<T> *break_point,
FixedPointTensor<T, N> *ret) const {
const TensorAdapter<T>* coeff,
const TensorAdapter<T>* break_point,
FixedPointTensor<T, N>* ret) const {
// e.g., x.shape = {2, 3},
// break_point.shape = {k, 2, 3} (k: num of break point)
// coeff.shape = {k + 1, n, 2, 3} (n: poly power)
// copy ret
auto ret_cpy_s0 = tensor_factory()->create_int64_t(ret->share(0)->shape());
ret->share(0)->copy(ret_cpy_s0.get());
auto ret_cpy_s1 = tensor_factory()->create_int64_t(ret->share(1)->shape());
ret->share(1)->copy(ret_cpy_s1.get());
std::shared_ptr<FixedPointTensor<T, N>> ret_cpy{new FixedPointTensor<T, N>(ret_cpy_s0.get(), ret_cpy_s1.get())};
std::vector<std::shared_ptr<BooleanTensor<T>>> msb;
int len_break_point = break_point->shape()[0];
int len_coeff = coeff->shape()[0];
// number of temp tensors used
int temp_total =
4 * len_break_point + 2 + 2 * (len_break_point - 1) + 2 + 4 * len_coeff;
//number of temp tensors used
int temp_total = 4 * len_break_point + 2 +
2 * (len_break_point - 1) + 2 + 4 * len_coeff;
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
for (int i = 0; i < temp_total; ++i) {
temp.emplace_back(tensor_factory()->template create<T>(this->shape()));
temp.emplace_back(tensor_factory()->
template create<T>(this->shape()));
}
int temp_index = 0;
......@@ -641,29 +962,35 @@ void FixedPointTensor<T, N>::polynomial_piecewise(
t_shape.erase(t_shape.begin());
t_break->reshape(t_shape);
temp1.emplace_back(std::make_shared<FixedPointTensor<T, N>>(
temp[temp_index++].get(), temp[temp_index++].get()));
temp1.emplace_back(
std::make_shared<FixedPointTensor<T, N>>(
temp[temp_index++].get(),
temp[temp_index++].get()));
this->sub(t_break.get(), temp1[i].get());
msb.emplace_back(std::make_shared<BooleanTensor<T>>(
temp[temp_index++].get(), temp[temp_index++].get()));
temp[temp_index++].get(),
temp[temp_index++].get()));
msb[i]->bit_extract(sizeof(T) * 8 - 1, temp1[i].get());
}
// b[0] = msb[0], b[i + 1] = ~ msb[i] & msb[i + 1]
std::vector<std::shared_ptr<BooleanTensor<T>>> b;
b.emplace_back(std::make_shared<BooleanTensor<T>>(temp[temp_index++].get(),
b.emplace_back(std::make_shared<BooleanTensor<T>>(
temp[temp_index++].get(),
temp[temp_index++].get()));
b[0] = msb[0];
for (int i = 0; i < len_break_point - 1; ++i) {
b.emplace_back(std::make_shared<BooleanTensor<T>>(
temp[temp_index++].get(), temp[temp_index++].get()));
temp[temp_index++].get(),
temp[temp_index++].get()));
msb[i]->bitwise_not(b[i + 1].get());
b[i + 1]->bitwise_and(msb[i + 1].get(), b[i + 1].get());
}
b.emplace_back(std::make_shared<BooleanTensor<T>>(temp[temp_index++].get(),
b.emplace_back(std::make_shared<BooleanTensor<T>>(
temp[temp_index++].get(),
temp[temp_index++].get()));
msb[len_break_point - 1]->bitwise_not(b[len_break_point].get());
......@@ -671,119 +998,299 @@ void FixedPointTensor<T, N>::polynomial_piecewise(
std::vector<std::shared_ptr<FixedPointTensor<T, N>>> temp_fixed;
std::vector<std::shared_ptr<FixedPointTensor<T, N>>> temp_fixed1;
assign_to_tensor(ret->_share[0], (T)0);
assign_to_tensor(ret->_share[1], (T)0);
assign_to_tensor(ret_cpy->_share[0], (T) 0);
assign_to_tensor(ret_cpy->_share[1], (T) 0);
for (int i = 0; i < len_coeff; ++i) {
temp_fixed.emplace_back(std::make_shared<FixedPointTensor<T, N>>(
temp[temp_index++].get(), temp[temp_index++].get()));
temp_fixed1.emplace_back(std::make_shared<FixedPointTensor<T, N>>(
temp[temp_index++].get(), temp[temp_index++].get()));
temp_fixed.emplace_back(
std::make_shared<FixedPointTensor<T, N>>(
temp[temp_index++].get(),
temp[temp_index++].get()));
temp_fixed1.emplace_back(
std::make_shared<FixedPointTensor<T, N>>(
temp[temp_index++].get(),
temp[temp_index++].get()));
auto t = tensor_factory()->template create<T>();
coeff->slice(i, i + 1, t.get());
auto t_shape = t->shape();
// remove leading 1
t_shape.erase(t_shape.begin());
t->reshape(t_shape);
t->reshape(t_shape);
this->polynomial(t.get(), temp_fixed[i].get());
b[i]->bit_extract(0, b[i].get());
b[i]->mul(temp_fixed[i].get(), temp_fixed1[i].get());
ret->add(temp_fixed1[i].get(), ret);
ret_cpy->add(temp_fixed1[i].get(), ret_cpy.get());
}
ret_cpy->share(0)->copy(ret->mutable_share(0));
ret_cpy->share(1)->copy(ret->mutable_share(1));
}
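// ---------------------------------------------------------------------------
// Illustration only (not part of this header): the indicators built above are
// b[0] = (x < break[0]), b[i] = (break[i-1] <= x < break[i]) and
// b[k] = (x >= break[k-1]); exactly one of them is 1, so the result is the
// indicator-weighted sum of the per-piece polynomials.  Plain-double sketch,
// helper names are assumptions.
#include <cstddef>
#include <vector>

double piecewise_poly(const std::vector<std::vector<double>>& coeff,
                      const std::vector<double>& break_point, double x) {
    auto poly = [](const std::vector<double>& c, double v) {
        double r = 0.0, p = 1.0;
        for (double ci : c) { r += ci * p; p *= v; }
        return r;
    };
    const size_t k = break_point.size();
    double result = 0.0;
    for (size_t i = 0; i <= k; ++i) {
        bool lower_ok = (i == 0) || (x >= break_point[i - 1]);
        bool upper_ok = (i == k) || (x < break_point[i]);
        double b = (lower_ok && upper_ok) ? 1.0 : 0.0;  // one-hot selector
        result += b * poly(coeff[i], x);
    }
    return result;
}
// ---------------------------------------------------------------------------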
template <typename T, size_t N>
template <template <typename U, size_t...> class CTensor, size_t... N1>
void FixedPointTensor<T, N>::lt(const CTensor<T, N1...> *rhs,
BooleanTensor<T> *ret) const {
template<typename T, size_t N>
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void FixedPointTensor<T, N>::lt(const CTensor<T, N1...>* rhs,
BooleanTensor<T>* ret) const {
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
for (int i = 0; i < 2; ++i) {
temp.emplace_back(tensor_factory()->template create<T>(this->shape()));
temp.emplace_back(
tensor_factory()->template create<T>(this->shape()));
}
std::shared_ptr<FixedPointTensor<T, N>> sub_result =
std::make_shared<FixedPointTensor<T, N>>(temp[0].get(), temp[1].get());
std::make_shared<FixedPointTensor<T, N>>(
temp[0].get(), temp[1].get());
this->sub(rhs, sub_result.get());
ret->bit_extract(sizeof(T) * 8 - 1, sub_result.get());
}
template <typename T, size_t N>
template <template <typename U, size_t...> class CTensor, size_t... N1>
void FixedPointTensor<T, N>::leq(const CTensor<T, N1...> *rhs,
BooleanTensor<T> *ret) const {
template<typename T, size_t N>
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void FixedPointTensor<T, N>::leq(const CTensor<T, N1...>* rhs,
BooleanTensor<T>* ret) const {
this->gt(rhs, ret);
auto tensor_one = tensor_factory()->template create<T>(this->shape());
auto tensor_one = tensor_factory()->
template create<T>(this->shape());
assign_to_tensor(tensor_one.get(), (T)1);
assign_to_tensor(tensor_one.get(), (T) 1);
ret->bitwise_xor(tensor_one.get(), ret);
}
template <typename T, size_t N>
template <template <typename U, size_t...> class CTensor, size_t... N1>
void FixedPointTensor<T, N>::gt(const CTensor<T, N1...> *rhs,
BooleanTensor<T> *ret) const {
template<typename T, size_t N>
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void FixedPointTensor<T, N>::gt(const CTensor<T, N1...>* rhs,
BooleanTensor<T>* ret) const {
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
for (int i = 0; i < 2; ++i) {
temp.emplace_back(tensor_factory()->template create<T>(this->shape()));
temp.emplace_back(
tensor_factory()->template create<T>(this->shape()));
}
std::shared_ptr<FixedPointTensor<T, N>> sub_result =
std::make_shared<FixedPointTensor<T, N>>(temp[0].get(), temp[1].get());
std::make_shared<FixedPointTensor<T, N>>(
temp[0].get(), temp[1].get());
this->sub(rhs, sub_result.get());
sub_result->negative(sub_result.get());
ret->template bit_extract(sizeof(T) * 8 - 1, sub_result.get());
}
template <typename T, size_t N>
template <template <typename U, size_t...> class CTensor, size_t... N1>
void FixedPointTensor<T, N>::geq(const CTensor<T, N1...> *rhs,
BooleanTensor<T> *ret) const {
template<typename T, size_t N>
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void FixedPointTensor<T, N>::geq(const CTensor<T, N1...>* rhs,
BooleanTensor<T>* ret) const {
this->lt(rhs, ret);
auto tensor_one = tensor_factory()->template create<T>(this->shape());
auto tensor_one = tensor_factory()->
template create<T>(this->shape());
assign_to_tensor(tensor_one.get(), (T)1);
assign_to_tensor(tensor_one.get(), (T) 1);
ret->bitwise_xor(tensor_one.get(), ret);
}
template <typename T, size_t N>
template <template <typename U, size_t...> class CTensor, size_t... N1>
void FixedPointTensor<T, N>::eq(const CTensor<T, N1...> *rhs,
BooleanTensor<T> *ret) const {
template<typename T, size_t N>
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void FixedPointTensor<T, N>::eq(const CTensor<T, N1...>* rhs,
BooleanTensor<T>* ret) const {
this->neq(rhs, ret);
auto tensor_one = tensor_factory()->template create<T>(this->shape());
assign_to_tensor(tensor_one.get(), (T)1);
assign_to_tensor(tensor_one.get(), (T) 1);
ret->bitwise_xor(tensor_one.get(), ret);
}
template <typename T, size_t N>
template <template <typename U, size_t...> class CTensor, size_t... N1>
void FixedPointTensor<T, N>::neq(const CTensor<T, N1...> *rhs,
BooleanTensor<T> *ret) const {
template<typename T, size_t N>
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void FixedPointTensor<T, N>::neq(const CTensor<T, N1...>* rhs,
BooleanTensor<T>* ret) const {
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
for (int i = 0; i < 4; i++) {
temp.emplace_back(tensor_factory()->template create<T>(this->shape()));
for (int i = 0; i < 4; i++) {
temp.emplace_back(tensor_factory()->
template create<T>(this->shape()));
}
std::shared_ptr<BooleanTensor<T>> lt =
std::make_shared<BooleanTensor<T>>(temp[0].get(), temp[1].get());
std::make_shared<BooleanTensor<T>>(
temp[0].get(), temp[1].get());
std::shared_ptr<BooleanTensor<T>> gt =
std::make_shared<BooleanTensor<T>>(temp[2].get(), temp[3].get());
std::make_shared<BooleanTensor<T>>(
temp[2].get(), temp[3].get());
this->lt(rhs, lt.get());
this->gt(rhs, gt.get());
lt->bitwise_or(gt.get(), ret);
}
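// ---------------------------------------------------------------------------
// Illustration only (not part of this header): every comparison above reduces
// to "take the sign bit of a difference" in two's complement, plus an xor
// with 1 (logical NOT) or an OR to derive the remaining relations.  Wrap-around
// at the ring boundary is ignored in this plaintext sketch; helper names are
// assumptions.
#include <cstdint>

inline uint64_t msb(int64_t v) { return static_cast<uint64_t>(v) >> 63; }

inline uint64_t lt_ (int64_t x, int64_t y) { return msb(x - y); }
inline uint64_t gt_ (int64_t x, int64_t y) { return msb(-(x - y)); }
inline uint64_t leq_(int64_t x, int64_t y) { return gt_(x, y) ^ 1; }
inline uint64_t geq_(int64_t x, int64_t y) { return lt_(x, y) ^ 1; }
inline uint64_t neq_(int64_t x, int64_t y) { return lt_(x, y) | gt_(x, y); }
inline uint64_t eq_ (int64_t x, int64_t y) { return neq_(x, y) ^ 1; }
// ---------------------------------------------------------------------------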
template <typename T>
inline void assign_to_tensor(TensorAdapter<T> *input, T assign_num) {
size_t size_one_dim = input->numel();
T *iter = input->data();
for (int i = 0; i < size_one_dim; ++i) {
*(iter + i) = assign_num;
template<typename T, size_t N>
void FixedPointTensor<T, N>::reciprocal(const FixedPointTensor<T, N>* op, FixedPointTensor<T, N>* ret,
size_t iter, double x0) {
auto temp0 = tensor_factory()->template create<T>(ret->shape());
auto temp1 = tensor_factory()->template create<T>(ret->shape());
auto temp2 = tensor_factory()->template create<T>(ret->shape());
auto temp3 = tensor_factory()->template create<T>(ret->shape());
std::shared_ptr<FixedPointTensor<T, N>> result =
std::make_shared<FixedPointTensor<T, N>>(temp0.get(), temp1.get());
std::shared_ptr<FixedPointTensor<T, N>> x_copy =
std::make_shared<FixedPointTensor<T, N>>(temp2.get(), temp3.get());
assign_to_tensor(result->mutable_share(0), (T) 0);
assign_to_tensor(result->mutable_share(1), (T) 0);
auto tensor_x0 = tensor_factory()->template create<T>(op->shape());
assign_to_tensor(tensor_x0.get(), (T)(x0 * pow(2, N)));
tensor_x0->scaling_factor() = N;
result->add(tensor_x0.get(), result.get());
auto tensor_2 = tensor_factory()->template create<T>(op->shape());
tensor_2->scaling_factor() = N;
assign_to_tensor(tensor_2.get(), (T)(2 << N));
for (int i = 0; i < iter; ++i) {
result->share(0)->copy(x_copy->mutable_share(0));
result->share(1)->copy(x_copy->mutable_share(1));
auto res_ptr = result.get();
op->mul(res_ptr, res_ptr);
result->negative(res_ptr);
result->add(tensor_2.get(), res_ptr);
x_copy->mul(res_ptr, res_ptr);
}
result->share(0)->copy(ret->mutable_share(0));
result->share(1)->copy(ret->mutable_share(1));
}
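// ---------------------------------------------------------------------------
// Illustration only (not part of this header): the loop above is Newton's
// iteration x_{k+1} = x_k * (2 - a * x_k), which converges quadratically to
// 1/a when the initial guess is close enough (the softmax above passes
// x0 = 0.5 / col).  Plain-double sketch; helper name is an assumption.
#include <cstddef>

double newton_reciprocal(double a, double x0, size_t iters) {
    double x = x0;
    for (size_t i = 0; i < iters; ++i) {
        x = x * (2.0 - a * x);  // error shrinks as e_{k+1} = e_k^2
    }
    return x;  // e.g. newton_reciprocal(4.0, 0.1, 16) is approximately 0.25
}
// ---------------------------------------------------------------------------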
template<typename T, size_t N>
void FixedPointTensor<T, N>::inverse_square_root(FixedPointTensor* ret,
size_t iter,
double x0) const {
inverse_square_root(this, ret, iter, x0);
}
// Newton's method; variable naming follows Quake III Arena's Q_rsqrt
// float threehalfs = 1.5F;
// x2 = number * 0.5F;
// y = x0; // since 0x5f3759df does not fit fixed-point arithmetic
// y = y * ( threehalfs - ( x2 * y * y ) ); // iteration of Newton's method
template<typename T, size_t N>
void FixedPointTensor<T, N>::inverse_square_root(const FixedPointTensor* op,
FixedPointTensor* ret,
size_t iter,
double x0) {
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
for (int i = 0; i < 7; ++i) {
temp.emplace_back(
tensor_factory()->template create<T>(op->shape()));
}
std::shared_ptr<FixedPointTensor<T, N>> y =
std::make_shared<FixedPointTensor<T, N>>(temp[0].get(), temp[1].get());
std::shared_ptr<FixedPointTensor<T, N>> x2 =
std::make_shared<FixedPointTensor<T, N>>(temp[2].get(), temp[3].get());
// x2 = 0.5 * op
truncate3(op, x2.get(), 1);
assign_to_tensor(y->mutable_share(0), (T)(x0 * pow(2, N)));
assign_to_tensor(y->mutable_share(1), (T)(x0 * pow(2, N)));
// threehalfs
temp[4]->scaling_factor() = N;
assign_to_tensor(temp[4].get(), T(1.5 * pow(2, N)));
std::shared_ptr<FixedPointTensor<T, N>> y_copy =
std::make_shared<FixedPointTensor<T, N>>(temp[5].get(), temp[6].get());
for (int i = 0; i < iter; ++i) {
y->share(0)->copy(y_copy->mutable_share(0));
y->share(1)->copy(y_copy->mutable_share(1));
y->mul(y.get(), y.get());
y->mul(x2.get(), y.get());
y->negative(y.get());
y->add(temp[4].get(), y.get());
y_copy->mul(y.get(), y.get());
}
y->share(0)->copy(ret->mutable_share(0));
y->share(1)->copy(ret->mutable_share(1));
}
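// ---------------------------------------------------------------------------
// Illustration only (not part of this header): the loop above is the Newton
// iteration y_{k+1} = y_k * (1.5 - 0.5 * a * y_k^2), which converges to
// 1/sqrt(a); the fixed-point version cannot use Q_rsqrt's 0x5f3759df bit
// trick, so it starts from the caller-supplied x0.  Plain-double sketch;
// helper name is an assumption.
#include <cstddef>

double newton_rsqrt(double a, double x0, size_t iters) {
    const double x2 = 0.5 * a;
    double y = x0;
    for (size_t i = 0; i < iters; ++i) {
        y = y * (1.5 - x2 * y * y);
    }
    return y;  // e.g. newton_rsqrt(4.0, 0.3, 8) is approximately 0.5
}
// ---------------------------------------------------------------------------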
template<typename T, size_t N>
template<template<typename U, size_t...> class CTensor,
size_t... N1>
void FixedPointTensor<T, N>::max(const CTensor<T, N1...>* rhs,
FixedPointTensor* ret,
BooleanTensor<T>* cmp) const {
// max = lhs + (rhs - lhs) if rhs > lhs else lhs
std::vector<std::shared_ptr<TensorAdapter<T>>> temp;
bool output_cmp = cmp != nullptr;
// if cmp is not null, store cmp results in cmp
// else, store them in tmp tensors
for (int i = 0; i < 2 + 2 * (!output_cmp); ++i) {
temp.emplace_back(
tensor_factory()->template create<T>(this->shape()));
}
FixedPointTensor<T, N> delta(temp[0].get(), temp[1].get());
sub(rhs, &delta);
BooleanTensor<T> sign;
if (output_cmp) {
sign = *cmp;
} else {
sign = BooleanTensor<T>(temp[2].get(), temp[3].get());
}
sign.template bit_extract(sizeof(T) * 8 - 1, &delta);
delta.negative(&delta);
sign.mul(&delta, &delta);
add(&delta, ret);
}
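// ---------------------------------------------------------------------------
// Illustration only (not part of this header): the branch-free max above is
// max(a, b) = a + (a < b) * (b - a), with the comparison taken from the sign
// bit of a - b.  Plaintext sketch ignoring wrap-around; helper name is an
// assumption.
#include <cstdint>

inline int64_t branchless_max(int64_t a, int64_t b) {
    int64_t delta = a - b;
    int64_t is_lt = static_cast<int64_t>(static_cast<uint64_t>(delta) >> 63);  // 1 iff a < b
    return a + is_lt * (-delta);  // a + (a < b) * (b - a)
}
// ---------------------------------------------------------------------------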
template<typename T, size_t N>
void FixedPointTensor<T, N>::max_pooling(FixedPointTensor* ret,
BooleanTensor<T>* pos) const {
size_t k = shape()[0];
std::vector<std::shared_ptr<TensorAdapter<T>>> tmp;
for (int i = 0; i < 4; ++i) {
tmp.emplace_back(
tensor_factory()->template create<T>());
}
FixedPointTensor now(tmp[0].get(), tmp[1].get());
BooleanTensor<T> cmp(tmp[2].get(), tmp[3].get());
auto cmp_ptr = pos ? &cmp : nullptr;
share(0)->slice(0, 1, tmp[0].get());
share(1)->slice(0, 1, tmp[1].get());
tmp[0]->copy(ret->mutable_share(0));
tmp[1]->copy(ret->mutable_share(1));
if (pos) {
pos->share(0)->slice(0, 1, tmp[2].get());
pos->share(1)->slice(0, 1, tmp[3].get());
// initialize the one-hot flag for slice 0 to a sharing of 1, since slice 0 starts as the running maximum
if (party() == 0 || party() == 2) {
size_t idx = 2 + (party() == 2);
assign_to_tensor(tmp[idx].get(), T(1));
assign_to_tensor(tmp[5 - idx].get(), T(0));
} else {
assign_to_tensor(tmp[2].get(), T(0));
assign_to_tensor(tmp[3].get(), T(0));
}
}
for (size_t i = 1; i < k; ++i) {
share(0)->slice(i, i + 1, tmp[0].get());
share(1)->slice(i, i + 1, tmp[1].get());
if (pos) {
pos->share(0)->slice(i, i + 1, tmp[2].get());
pos->share(1)->slice(i, i + 1, tmp[3].get());
}
ret->max(&now, ret, cmp_ptr);
}
if (pos) {
pos->onehot_from_cmp();
}
}
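// ---------------------------------------------------------------------------
// Illustration only (not part of this header): max_pooling keeps a running
// maximum over the k slices of dimension 0 and, when pos is given, a one-hot
// record of which slice supplied each maximum.  Plaintext sketch with an
// explicit argmax instead of the one-hot shares; helper name is an assumption.
#include <cstddef>
#include <cstdint>
#include <vector>

void max_pool_dim0(const std::vector<std::vector<int64_t>>& slices,
                   std::vector<int64_t>& max_out,
                   std::vector<size_t>& argmax_out) {
    const size_t cols = slices.front().size();
    max_out = slices[0];
    argmax_out.assign(cols, 0);
    for (size_t i = 1; i < slices.size(); ++i) {
        for (size_t c = 0; c < cols; ++c) {
            if (slices[i][c] > max_out[c]) {  // shared version uses max() with cmp output
                max_out[c] = slices[i][c];
                argmax_out[c] = i;
            }
        }
    }
}
// ---------------------------------------------------------------------------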
} // namespace aby3
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -10,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <cmath>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -82,7 +86,7 @@ std::shared_ptr<TensorAdapter<int64_t>> gen(std::vector<size_t> shape) {
}
template<typename T, size_t N>
PaddleTensor<T> test_fixedt_gen_paddle_tensor(std::vector<float>& input,
PaddleTensor<T> test_fixedt_gen_paddle_tensor(std::vector<double>& input,
std::vector<size_t>& shape,
paddle::platform::CPUDeviceContext& cpu_ctx) {
......@@ -96,13 +100,15 @@ PaddleTensor<T> test_fixedt_gen_paddle_tensor(std::vector<float>& input,
}
template<typename T>
bool test_fixedt_check_tensor_eq(const TensorAdapter<T>* in1,
const TensorAdapter<T>* in2, double precision = 0.0001) {
bool test_fixedt_check_tensor_eq(const TensorAdapter<T>* result,
const TensorAdapter<T>* expected,
double precision = 0.0001,
bool use_relative_error = false) {
// check shape
std::vector<size_t> shape1, shape2;
shape1 = in1->shape();
shape2 = in2->shape();
size_t scale = in1->scaling_factor();
shape1 = result->shape();
shape2 = expected->shape();
size_t scale = result->scaling_factor();
if (shape1.size() != shape2.size()) {
std::cout << "shape size error: shape1.size: "<<shape1.size()<<
"; shape2.size: "<<shape2.size()<<std::endl;
......@@ -116,15 +122,28 @@ bool test_fixedt_check_tensor_eq(const TensorAdapter<T>* in1,
}
// check each element
for (int i = 0; i < in1->numel(); i++) {
if (std::abs(*(in1->data() + i) - *(in2->data() + i)) >
precision * pow(2, scale)) {
std::cout << "result error: inx: "<<i<<
" in1[i] = "<<*(in1->data() + i)<<
" in2[i] = "<<*(in2->data() + i)<<std::endl;
return false;
}
}
bool return_false = false;
for (int i = 0; i < result->numel(); i++) {
// absolute error
if (!use_relative_error && std::abs(*(result->data() + i) - *(expected->data() + i)) >
precision * std::pow(2, scale)) {
std::cout << "result error: index: "<< i <<
" output[i] = "<< *(result->data() + i) / pow(2, 16) <<
" expected[i] = " << *(expected->data() + i) / pow(2, 16) << std::endl;
return_false = true;
}
// relative error
if (use_relative_error
&& std::abs(*(result->data() + i) - *(expected->data() + i))
/ (std::abs(*(expected->data() + i)) + 0.00000001)
> precision) {
std::cout << "result error: index: "<< i <<
" output[i] = " << *(result->data() + i) / pow(2, 16) <<
" expected[i] = " << *(expected->data() + i) / pow(2, 16) << std::endl;
return_false = true;
}
}
if (return_false) return false;
return true;
}
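// ---------------------------------------------------------------------------
// Illustration only (not test code): the checker accepts an element when
// |result - expected| <= precision * 2^scale in absolute mode, or when
// |result - expected| / (|expected| + 1e-8) <= precision in relative mode.
// Helper name is an assumption.
#include <cmath>
#include <cstdint>

inline bool within_tolerance(int64_t result, int64_t expected, size_t scale,
                             double precision, bool use_relative_error) {
    const double diff = std::abs(static_cast<double>(result) - static_cast<double>(expected));
    if (!use_relative_error) {
        return diff <= precision * std::pow(2.0, static_cast<double>(scale));
    }
    return diff / (std::abs(static_cast<double>(expected)) + 1e-8) <= precision;
}
// ---------------------------------------------------------------------------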
......@@ -338,23 +357,23 @@ void test_fixedt_mul_fixed(size_t p,
result->reveal(out);
}
void test_fixedt_mul2_fixed(size_t p,
void test_fixedt_mul_plain(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> temp;
for (int i = 0; i < 6; i++) {
for (int i = 0; i < 4; i++) {
temp.emplace_back(gen(out->shape()));
}
test_fixedt_gen_shares(p, in, temp);
test_fixedt_gen_shares(p, in[0], temp);
Fix64N16* lhs = new Fix64N16(temp[0].get(), temp[1].get());
Fix64N16* rhs = new Fix64N16(temp[2].get(), temp[3].get());
Fix64N16* result = new Fix64N16(temp[4].get(), temp[5].get());
lhs->mul2(rhs, result);
Fix64N16* result = new Fix64N16(temp[2].get(), temp[3].get());
lhs->mul(in[1].get(), result);
result->reveal(out);
}
void test_fixedt_mul_plain(size_t p,
void test_fixedt_div_plain(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> temp;
......@@ -366,23 +385,23 @@ void test_fixedt_mul_plain(size_t p,
Fix64N16* lhs = new Fix64N16(temp[0].get(), temp[1].get());
Fix64N16* result = new Fix64N16(temp[2].get(), temp[3].get());
lhs->mul(in[1].get(), result);
lhs->div(in[1].get(), result);
result->reveal(out);
}
void test_fixedt_div_plain(size_t p,
void test_fixedt_div_fixed(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> temp;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < 6; i++) {
temp.emplace_back(gen(out->shape()));
}
test_fixedt_gen_shares(p, in[0], temp);
test_fixedt_gen_shares(p, in, temp);
Fix64N16* lhs = new Fix64N16(temp[0].get(), temp[1].get());
Fix64N16* result = new Fix64N16(temp[2].get(), temp[3].get());
lhs->div(in[1].get(), result);
Fix64N16* rhs = new Fix64N16(temp[2].get(), temp[3].get());
Fix64N16* result = new Fix64N16(temp[4].get(), temp[5].get());
lhs->div(rhs, result);
result->reveal(out);
}
......@@ -496,6 +515,22 @@ void test_fixedt_relu_fixed(size_t p,
result->reveal(out);
}
void test_fixedt_relu2_fixed(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> temp;
for (int i = 0; i < 4; i++) {
temp.emplace_back(gen(out->shape()));
}
test_fixedt_gen_shares(p, in[0], temp);
Fix64N16* lhs = new Fix64N16(temp[0].get(), temp[1].get());
Fix64N16* result = new Fix64N16(temp[2].get(), temp[3].get());
lhs->relu_with_derivative(result, nullptr);
result->reveal(out);
}
void test_fixedt_softmax_fixed(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
......@@ -528,7 +563,7 @@ void test_fixedt_sigmoid_fixed(size_t p,
result->reveal(out);
}
void test_fixedt_exp_fixed(size_t p,
void test_fixedt_sigmoid_enhanced_fixed(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> temp;
......@@ -540,23 +575,39 @@ void test_fixedt_exp_fixed(size_t p,
Fix64N16* lhs = new Fix64N16(temp[0].get(), temp[1].get());
Fix64N16* result = new Fix64N16(temp[2].get(), temp[3].get());
lhs->exp(result);
lhs->sigmoid_enhanced(result);
result->reveal(out);
}
void test_fixedt_mat_mul_fixed(size_t p,
void test_fixedt_sigmoid_chebyshev_fixed(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> temp;
for (int i = 0; i < 6; i++) {
for (int i = 0; i < 4; i++) {
temp.emplace_back(gen(out->shape()));
}
test_fixedt_gen_shares(p, in, temp);
test_fixedt_gen_shares(p, in[0], temp);
Fix64N16* lhs = new Fix64N16(temp[0].get(), temp[1].get());
Fix64N16* rhs = new Fix64N16(temp[2].get(), temp[3].get());
Fix64N16* result = new Fix64N16(temp[4].get(), temp[5].get());
lhs->mat_mul(rhs, result);
Fix64N16* result = new Fix64N16(temp[2].get(), temp[3].get());
lhs->sigmoid_chebyshev(result);
result->reveal(out);
}
void test_fixedt_exp_fixed(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> temp;
for (int i = 0; i < 4; i++) {
temp.emplace_back(gen(out->shape()));
}
test_fixedt_gen_shares(p, in[0], temp);
Fix64N16* lhs = new Fix64N16(temp[0].get(), temp[1].get());
Fix64N16* result = new Fix64N16(temp[2].get(), temp[3].get());
lhs->exp(result);
result->reveal(out);
}
......@@ -829,7 +880,13 @@ void test_fixedt_matmul_fixed(size_t p,
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in,
TensorAdapter<int64_t>* out) {
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> temp;
for (int i = 0; i < 6; i++) {
for (int i = 0; i < 2; i++) {
temp.emplace_back(gen(in[0]->shape()));
}
for (int i = 2; i < 4; i++) {
temp.emplace_back(gen(in[1]->shape()));
}
for (int i = 4; i < 6; i++) {
temp.emplace_back(gen(out->shape()));
}
......@@ -843,24 +900,26 @@ void test_fixedt_matmul_fixed(size_t p,
TEST_F(FixedTensorTest, matmulfixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {4.0, 4.0, 4.0, 4.0};
std::vector<size_t> shape = {1, 3};
std::vector<size_t> shape1 = {3, 1};
std::vector<size_t> shape_o = {1, 1};
std::vector<double> in0_val = {1, 0, 0};
std::vector<double> in1_val = {1, 2, 3};
std::vector<double> res_val = {1};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
{gen(shape), gen(shape1)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
shape1, _cpu_ctx).copy(in[1].get());
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
auto out0 = _s_tensor_factory->create<int64_t>(shape_o);
auto out1 = _s_tensor_factory->create<int64_t>(shape_o);
auto out2 = _s_tensor_factory->create<int64_t>(shape_o);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape_o, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
......@@ -892,7 +951,7 @@ TEST_F(FixedTensorTest, matmulfixed) {
TEST_F(FixedTensorTest, share) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in_val = {1.0, 1.0, 1.0, 1.0};
PaddleTensor<int64_t> input =
test_fixedt_gen_paddle_tensor<int64_t, 16>(in_val, shape, _cpu_ctx);
auto output = _s_tensor_factory->create<int64_t>(shape);
......@@ -930,9 +989,9 @@ TEST_F(FixedTensorTest, share) {
TEST_F(FixedTensorTest, addfixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {3.0, 3.0, 3.0, 3.0};
std::vector<double> in0_val = {0x1p47 - 1, 5 + 0x1p-16, 1.0, 1.0};
std::vector<double> in1_val = {1.0, 8 + (1 - 0x1p-16), 2.0, 2.0};
std::vector<double> res_val = {-0x1p47, 14, 3.0, 3.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -975,9 +1034,9 @@ TEST_F(FixedTensorTest, addfixed) {
TEST_F(FixedTensorTest, addplain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {3.0, 3.0, 3.0, 3.0};
std::vector<double> in0_val = {1.0, 5 + 0x1p-16, 1.0, 1.0};
std::vector<double> in1_val = {0x1p47 - 1, 8 + (1 - 0x1p-16), 2.0, 2.0};
std::vector<double> res_val = {-0x1p47, 14.0, 3.0, 3.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1023,9 +1082,9 @@ TEST_F(FixedTensorTest, addplain) {
TEST_F(FixedTensorTest, subfixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 3.0, 3.0, 3.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in0_val = {3.0, 3.0, 3.0, 3.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {1.0, 1.0, 1.0, 1.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1069,9 +1128,9 @@ TEST_F(FixedTensorTest, subfixed) {
TEST_F(FixedTensorTest, subplain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 3.0, 3.0, 3.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in0_val = {3.0, 3.0, 3.0, 3.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {1.0, 1.0, 1.0, 1.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1117,9 +1176,9 @@ TEST_F(FixedTensorTest, subplain) {
TEST_F(FixedTensorTest, negfixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
//std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {-1.0, -1.0, -1.0, -1.0};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
//std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {-1.0, -1.0, -1.0, -1.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
......@@ -1163,9 +1222,10 @@ TEST_F(FixedTensorTest, negfixed) {
TEST_F(FixedTensorTest, mulfixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {2.0, 2.0, 2.0, 2.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1209,12 +1269,24 @@ TEST_F(FixedTensorTest, mulfixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, mul2fixed) {
TEST_F(FixedTensorTest, mulfixed_multi_times) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {2.0, 2.0, 2.0, 2.0};
std::vector<size_t> shape = {100000, 1};
std::vector<double> in0_val(shape[0]), in1_val(shape[0]), res_val(shape[0]);
auto fill_mul_data = [&in0_val, &in1_val, &res_val] () {
unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
std::default_random_engine generator(seed);
std::uniform_int_distribution<int64_t> input(-0x1p36, 0x1p36);
std::for_each(in0_val.begin(), in0_val.end(),
[] (double& a){ a = 1.0;});
std::for_each(in1_val.begin(), in1_val.end(),
[&input, &generator] (double& a){ a = input(generator) * pow(2, -16);});
std::transform(in0_val.begin(), in0_val.end(), in1_val.begin(), res_val.begin(),
[] (double& a, double& b){ return a * b;});
};
fill_mul_data();
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1232,19 +1304,19 @@ TEST_F(FixedTensorTest, mul2fixed) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_mul2_fixed(0, in, out0.get());
test_fixedt_mul_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_mul2_fixed(1, in, out1.get());
test_fixedt_mul_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_mul2_fixed(2, in, out2.get());
test_fixedt_mul_fixed(2, in, out2.get());
});
});
......@@ -1258,12 +1330,15 @@ TEST_F(FixedTensorTest, mul2fixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, mulplain) {
TEST_F(FixedTensorTest, mulfixed_overflow) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {2.0, 2.0, 2.0, 2.0};
std::vector<size_t> shape = {1};
// a result of 2^32 or greater overflows the 64-bit ring
// (see the plaintext illustration after this test)
// note: a multiplier larger than 2^20 may yield a wrong result,
// since truncation requires 2^l << 2^k [as stated in ABY3]
std::vector<double> in0_val = {0x1p16};
std::vector<double> in1_val = {0x1p16};
std::vector<double> res_val = {0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1271,33 +1346,29 @@ TEST_F(FixedTensorTest, mulplain) {
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_mul_plain(0, in, out0.get());
test_fixedt_mul_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_mul_plain(1, in, out1.get());
test_fixedt_mul_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_mul_plain(2, in, out2.get());
test_fixedt_mul_fixed(2, in, out2.get());
});
});
......@@ -1311,12 +1382,15 @@ TEST_F(FixedTensorTest, mulplain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
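// ---------------------------------------------------------------------------
// Illustration only (not test code): with N = 16 both operands above encode
// as 2^16 * 2^16 = 2^32 raw, so the untruncated product is 2^32 * 2^32 = 2^64,
// which wraps to 0 in the 64-bit ring and decodes to the expected {0}.
// Helper name is an assumption.
#include <cassert>
#include <cstdint>

inline void mulfixed_overflow_demo() {
    const uint64_t raw0 = uint64_t(1) << 32;  // 2^16 at scaling factor 2^16
    const uint64_t raw1 = uint64_t(1) << 32;  // 2^16 at scaling factor 2^16
    const uint64_t product = raw0 * raw1;     // wraps modulo 2^64
    assert(product == 0);
    assert(static_cast<int64_t>(product >> 16) == 0);  // truncated result is 0
}
// ---------------------------------------------------------------------------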
TEST_F(FixedTensorTest, divplain) {
TEST_F(FixedTensorTest, mulfixed_upper_bound) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {0.5, 0.5, 0.5, 0.5};
std::vector<size_t> shape = {1, 2};
// each input should be less than 2^20;
// larger inputs may yield a wrong result,
// since truncation requires 2^l << 2^k [as stated in ABY3]
std::vector<double> in0_val = {1.0, 1.0};
std::vector<double> in1_val = {0x1p20, -0x1p20};
std::vector<double> res_val = {0x1p20, -0x1p20};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1324,33 +1398,29 @@ TEST_F(FixedTensorTest, divplain) {
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_div_plain(0, in, out0.get());
test_fixedt_mul_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_div_plain(1, in, out1.get());
test_fixedt_mul_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_div_plain(2, in, out2.get());
test_fixedt_mul_fixed(2, in, out2.get());
});
});
......@@ -1364,45 +1434,42 @@ TEST_F(FixedTensorTest, divplain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, sum) {
TEST_F(FixedTensorTest, mulfixed_low_bound) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
//std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {4.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
std::vector<size_t> shape = {1};
std::vector<double> in0_val = {1.0};
std::vector<double> in1_val = {0x1p-16};
std::vector<double> res_val = {0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
std::vector<size_t> ret_shape = {1};
auto out0 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out1 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out2 = _s_tensor_factory->create<int64_t>(ret_shape);
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, ret_shape, _cpu_ctx);
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_sum_fixed(0, in, out0.get());
test_fixedt_mul_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_sum_fixed(1, in, out1.get());
test_fixedt_mul_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_sum_fixed(2, in, out2.get());
test_fixedt_mul_fixed(2, in, out2.get());
});
});
......@@ -1416,12 +1483,12 @@ TEST_F(FixedTensorTest, sum) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, mat_mulfixed) {
TEST_F(FixedTensorTest, mulplain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {4.0, 4.0, 4.0, 4.0};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {2.0, 2.0, 2.0, 2.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1429,29 +1496,33 @@ TEST_F(FixedTensorTest, mat_mulfixed) {
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_mat_mul_fixed(0, in, out0.get());
test_fixedt_mul_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_mat_mul_fixed(1, in, out1.get());
test_fixedt_mul_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_mat_mul_fixed(2, in, out2.get());
test_fixedt_mul_plain(2, in, out2.get());
});
});
......@@ -1465,12 +1536,12 @@ TEST_F(FixedTensorTest, mat_mulfixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, mat_mulplain) {
TEST_F(FixedTensorTest, divplain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {4.0, 4.0, 4.0, 4.0};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {0.5, 0.5, 0.5, 0.5};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1492,19 +1563,19 @@ TEST_F(FixedTensorTest, mat_mulplain) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_mat_mul_plain(0, in, out0.get());
test_fixedt_div_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_mat_mul_plain(1, in, out1.get());
test_fixedt_div_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_mat_mul_plain(2, in, out2.get());
test_fixedt_div_plain(2, in, out2.get());
});
});
......@@ -1518,12 +1589,12 @@ TEST_F(FixedTensorTest, mat_mulplain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, dot_mul_fixed) {
TEST_F(FixedTensorTest, divfixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {8.0};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in1_val = {1.0, 10.0, 1000.0, 700.0};
std::vector<double> res_val = {1.0, 0.1, 0.001, 1.0 / 700};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1531,35 +1602,29 @@ TEST_F(FixedTensorTest, dot_mul_fixed) {
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
std::vector<size_t> ret_shape = {1};
auto out0 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out1 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out2 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, ret_shape, _cpu_ctx);
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_dot_mul_fixed(0, in, out0.get());
test_fixedt_div_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_dot_mul_fixed(1, in, out1.get());
test_fixedt_div_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_dot_mul_fixed(2, in, out2.get());
test_fixedt_div_fixed(2, in, out2.get());
});
});
......@@ -1570,15 +1635,16 @@ TEST_F(FixedTensorTest, dot_mul_fixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.2, true));
}
TEST_F(FixedTensorTest, dot_mul_plain) {
TEST_F(FixedTensorTest, divfixed_low_bound) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {8.0};
std::vector<size_t> shape = {1};
std::vector<double> in0_val = {1.0};
// divisor > 1/x0, default x0 = 2^-15
std::vector<double> in1_val = {0x1p15};
std::vector<double> res_val = {0x1p-15};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1586,37 +1652,29 @@ TEST_F(FixedTensorTest, dot_mul_plain) {
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
std::vector<size_t> ret_shape = {1};
auto out0 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out1 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out2 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, ret_shape, _cpu_ctx);
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_dot_mul_plain(0, in, out0.get());
test_fixedt_div_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_dot_mul_plain(1, in, out1.get());
test_fixedt_div_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_dot_mul_plain(2, in, out2.get());
test_fixedt_div_fixed(2, in, out2.get());
});
});
......@@ -1627,49 +1685,48 @@ TEST_F(FixedTensorTest, dot_mul_plain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.0001, true));
}
TEST_F(FixedTensorTest, gt_plain) {
TEST_F(FixedTensorTest, sum) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 3.0, 3.0, 3.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 1 / pow(2, 16), 1 / pow(2, 16)};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
//std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {4.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
std::vector<size_t> ret_shape = {1};
auto out0 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out1 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out2 = _s_tensor_factory->create<int64_t>(ret_shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, ret_shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_gt_plain(0, in, out0.get());
test_fixedt_sum_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_gt_plain(1, in, out1.get());
test_fixedt_sum_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_gt_plain(2, in, out2.get());
test_fixedt_sum_fixed(2, in, out2.get());
});
});
......@@ -1683,12 +1740,12 @@ TEST_F(FixedTensorTest, gt_plain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, gt_fixed) {
TEST_F(FixedTensorTest, mat_mulplain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 3.0, 3.0, 3.0};
std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 1 / pow(2, 16), 1 / pow(2, 16)};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {4.0, 4.0, 4.0, 4.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1710,19 +1767,19 @@ TEST_F(FixedTensorTest, gt_fixed) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_gt_fixed(0, in, out0.get());
test_fixedt_mat_mul_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_gt_fixed(1, in, out1.get());
test_fixedt_mat_mul_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_gt_fixed(2, in, out2.get());
test_fixedt_mat_mul_plain(2, in, out2.get());
});
});
......@@ -1736,12 +1793,12 @@ TEST_F(FixedTensorTest, gt_fixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, lt_plain) {
TEST_F(FixedTensorTest, dot_mul_fixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {2.0, 2.0, 3.0, 3.0};
std::vector<float> in1_val = {3.0, 3.0, 2.0, 2.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {8.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1752,30 +1809,32 @@ TEST_F(FixedTensorTest, lt_plain) {
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
std::vector<size_t> ret_shape = {1};
auto out0 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out1 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out2 = _s_tensor_factory->create<int64_t>(ret_shape);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_lt_plain(0, in, out0.get());
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, ret_shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_dot_mul_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_lt_plain(1, in, out1.get());
test_fixedt_dot_mul_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_lt_plain(2, in, out2.get());
test_fixedt_dot_mul_fixed(2, in, out2.get());
});
});
......@@ -1789,12 +1848,69 @@ TEST_F(FixedTensorTest, lt_plain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, lt_fixed) {
TEST_F(FixedTensorTest, dot_mul_plain) {
std::vector<size_t> shape = {2, 2};
std::vector<double> in0_val = {1.0, 1.0, 1.0, 1.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {8.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
std::vector<size_t> ret_shape = {1};
auto out0 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out1 = _s_tensor_factory->create<int64_t>(ret_shape);
auto out2 = _s_tensor_factory->create<int64_t>(ret_shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, ret_shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_dot_mul_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_dot_mul_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_dot_mul_plain(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, gt_plain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {2.0, 2.0, 3.0, 3.0};
std::vector<float> in1_val = {3.0, 3.0, 2.0, 2.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {3.0, 3.0, 3.0, 3.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 1 / pow(2, 16), 1 / pow(2, 16)};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1816,19 +1932,19 @@ TEST_F(FixedTensorTest, lt_fixed) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_lt_fixed(0, in, out0.get());
test_fixedt_gt_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_lt_fixed(1, in, out1.get());
test_fixedt_gt_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_lt_fixed(2, in, out2.get());
test_fixedt_gt_plain(2, in, out2.get());
});
});
......@@ -1842,12 +1958,12 @@ TEST_F(FixedTensorTest, lt_fixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, leq_plain) {
TEST_F(FixedTensorTest, gt_fixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {2.0, 3.0, 3.0, 3.0};
std::vector<float> in1_val = {3.0, 3.0, 2.0, 2.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {3.0, 3.0, 3.0, 3.0};
std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 1 / pow(2, 16), 1 / pow(2, 16)};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1869,19 +1985,19 @@ TEST_F(FixedTensorTest, leq_plain) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_leq_plain(0, in, out0.get());
test_fixedt_gt_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_leq_plain(1, in, out1.get());
test_fixedt_gt_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_leq_plain(2, in, out2.get());
test_fixedt_gt_fixed(2, in, out2.get());
});
});
......@@ -1895,12 +2011,12 @@ TEST_F(FixedTensorTest, leq_plain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, leq_fixed) {
TEST_F(FixedTensorTest, lt_plain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {2.0, 3.0, 3.0, 3.0};
std::vector<float> in1_val = {3.0, 3.0, 2.0, 2.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {2.0, 2.0, 3.0, 3.0};
std::vector<double> in1_val = {3.0, 3.0, 2.0, 2.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1922,19 +2038,19 @@ TEST_F(FixedTensorTest, leq_fixed) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_leq_fixed(0, in, out0.get());
test_fixedt_lt_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_leq_fixed(1, in, out1.get());
test_fixedt_lt_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_leq_fixed(2, in, out2.get());
test_fixedt_lt_plain(2, in, out2.get());
});
});
......@@ -1948,12 +2064,12 @@ TEST_F(FixedTensorTest, leq_fixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, geq_plain) {
TEST_F(FixedTensorTest, lt_fixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 3.0, 2.0, 2.0};
std::vector<float> in1_val = {2.0, 3.0, 3.0, 3.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {2.0, 2.0, 3.0, 3.0};
std::vector<double> in1_val = {3.0, 3.0, 2.0, 2.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -1975,19 +2091,19 @@ TEST_F(FixedTensorTest, geq_plain) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_geq_plain(0, in, out0.get());
test_fixedt_lt_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_geq_plain(1, in, out1.get());
test_fixedt_lt_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_geq_plain(2, in, out2.get());
test_fixedt_lt_fixed(2, in, out2.get());
});
});
......@@ -2001,12 +2117,12 @@ TEST_F(FixedTensorTest, geq_plain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, geq_fixed) {
TEST_F(FixedTensorTest, leq_plain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 3.0, 2.0, 2.0};
std::vector<float> in1_val = {2.0, 3.0, 3.0, 3.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {2.0, 3.0, 3.0, 3.0};
std::vector<double> in1_val = {3.0, 3.0, 2.0, 2.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -2028,19 +2144,19 @@ TEST_F(FixedTensorTest, geq_fixed) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_geq_fixed(0, in, out0.get());
test_fixedt_leq_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_geq_fixed(1, in, out1.get());
test_fixedt_leq_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_geq_fixed(2, in, out2.get());
test_fixedt_leq_plain(2, in, out2.get());
});
});
......@@ -2054,12 +2170,12 @@ TEST_F(FixedTensorTest, geq_fixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, eq_plain) {
TEST_F(FixedTensorTest, leq_fixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 3.0, 2.0, 3.0};
std::vector<float> in1_val = {3.0, 3.0, 3.0, 2.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {2.0, 3.0, 3.0, 3.0};
std::vector<double> in1_val = {3.0, 3.0, 2.0, 2.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -2081,19 +2197,19 @@ TEST_F(FixedTensorTest, eq_plain) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_eq_plain(0, in, out0.get());
test_fixedt_leq_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_eq_plain(1, in, out1.get());
test_fixedt_leq_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_eq_plain(2, in, out2.get());
test_fixedt_leq_fixed(2, in, out2.get());
});
});
......@@ -2107,12 +2223,12 @@ TEST_F(FixedTensorTest, eq_plain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, eq_fixed) {
TEST_F(FixedTensorTest, geq_plain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 3.0, 2.0, 3.0};
std::vector<float> in1_val = {3.0, 3.0, 3.0, 2.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {3.0, 3.0, 2.0, 2.0};
std::vector<double> in1_val = {2.0, 3.0, 3.0, 3.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -2134,19 +2250,19 @@ TEST_F(FixedTensorTest, eq_fixed) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_eq_fixed(0, in, out0.get());
test_fixedt_geq_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_eq_fixed(1, in, out1.get());
test_fixedt_geq_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_eq_fixed(2, in, out2.get());
test_fixedt_geq_plain(2, in, out2.get());
});
});
......@@ -2160,12 +2276,12 @@ TEST_F(FixedTensorTest, eq_fixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, neq_plain) {
TEST_F(FixedTensorTest, geq_fixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {2.0, 3.0, 3.0, 3.0};
std::vector<float> in1_val = {3.0, 2.0, 3.0, 3.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {3.0, 3.0, 2.0, 2.0};
std::vector<double> in1_val = {2.0, 3.0, 3.0, 3.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -2187,19 +2303,19 @@ TEST_F(FixedTensorTest, neq_plain) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_neq_plain(0, in, out0.get());
test_fixedt_geq_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_neq_plain(1, in, out1.get());
test_fixedt_geq_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_neq_plain(2, in, out2.get());
test_fixedt_geq_fixed(2, in, out2.get());
});
});
......@@ -2213,12 +2329,12 @@ TEST_F(FixedTensorTest, neq_plain) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, neq_fixed) {
TEST_F(FixedTensorTest, eq_plain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {3.0, 2.0, 3.0, 3.0};
std::vector<float> in1_val = {2.0, 3.0, 3.0, 3.0};
std::vector<float> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<double> in0_val = {3.0, 3.0, 2.0, 3.0};
std::vector<double> in1_val = {3.0, 3.0, 3.0, 2.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
......@@ -2240,19 +2356,19 @@ TEST_F(FixedTensorTest, neq_fixed) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_neq_fixed(0, in, out0.get());
test_fixedt_eq_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_neq_fixed(1, in, out1.get());
test_fixedt_eq_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_neq_fixed(2, in, out2.get());
test_fixedt_eq_plain(2, in, out2.get());
});
});
......@@ -2266,42 +2382,46 @@ TEST_F(FixedTensorTest, neq_fixed) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, exp_fixed) {
TEST_F(FixedTensorTest, eq_fixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {0.0, 0.0, 1.0, 1.0};
//std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {1.0, 1.0, 2.7183, 2.7183};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
std::vector<double> in0_val = {3.0, 3.0, 2.0, 3.0};
std::vector<double> in1_val = {3.0, 3.0, 3.0, 2.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_exp_fixed(0, in, out0.get());
test_fixedt_eq_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_exp_fixed(1, in, out1.get());
test_fixedt_eq_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_exp_fixed(2, in, out2.get());
test_fixedt_eq_fixed(2, in, out2.get());
});
});
......@@ -2310,47 +2430,51 @@ TEST_F(FixedTensorTest, exp_fixed) {
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.1));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.1));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.1));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, polynomial) {
// y = 1 + x
TEST_F(FixedTensorTest, neq_plain) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {-1.0, 2.0, 2.0, 2.0};
//std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {0.0, 3.0, 3.0, 3.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
std::vector<double> in0_val = {2.0, 3.0, 3.0, 3.0};
std::vector<double> in1_val = {3.0, 2.0, 3.0, 3.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_poly_fixed(0, in, out0.get());
test_fixedt_neq_plain(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_poly_fixed(1, in, out1.get());
test_fixedt_neq_plain(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_poly_fixed(2, in, out2.get());
test_fixedt_neq_plain(2, in, out2.get());
});
});
......@@ -2364,43 +2488,46 @@ TEST_F(FixedTensorTest, polynomial) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, polynomial_wise) {
// y = x + 1 (x >= 0)
// y = 1 (x < 0)
TEST_F(FixedTensorTest, neq_fixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {-1.0, 1.0, 2.0, 2.0};
//std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {1.0, 2.0, 3.0, 3.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
std::vector<double> in0_val = {3.0, 2.0, 3.0, 3.0};
std::vector<double> in1_val = {2.0, 3.0, 3.0, 3.0};
std::vector<double> res_val = {1 / pow(2, 16), 1 / pow(2, 16), 0, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in =
{gen(shape), gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
test_fixedt_gen_paddle_tensor<int64_t, 16>(in1_val,
shape, _cpu_ctx).copy(in[1].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
dynamic_cast<PaddleTensor<int64_t>*>(in[1].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_poly_wise_fixed(0, in, out0.get());
test_fixedt_neq_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_poly_wise_fixed(1, in, out1.get());
test_fixedt_neq_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_poly_wise_fixed(2, in, out2.get());
test_fixedt_neq_fixed(2, in, out2.get());
});
});
......@@ -2414,12 +2541,11 @@ TEST_F(FixedTensorTest, polynomial_wise) {
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, relu) {
TEST_F(FixedTensorTest, exp_fixed) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, -1.0, -2, 2};
//std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {1.0, 0.0, 0.0, 2};
std::vector<double> in0_val = {0.0, 0.0, 1.0, 1.0};
std::vector<double> res_val = {1.0, 1.0, 2.71828, 2.71828};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
......@@ -2437,19 +2563,19 @@ TEST_F(FixedTensorTest, relu) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_relu_fixed(0, in, out0.get());
test_fixedt_exp_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_relu_fixed(1, in, out1.get());
test_fixedt_exp_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_relu_fixed(2, in, out2.get());
test_fixedt_exp_fixed(2, in, out2.get());
});
});
......@@ -2458,17 +2584,19 @@ TEST_F(FixedTensorTest, relu) {
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.01, true));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.01, true));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.01, true));
}
TEST_F(FixedTensorTest, softmax) {
TEST_F(FixedTensorTest, exp_fixed_low_bound) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {1.0, 1.0, 1, 1};
//std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {0.5, 0.5, 0.5, 0.5};
std::vector<size_t> shape = {1, 3};
// exp is approximated by exp(x) ~= (1 + x/n)^n with n = 2^ite = 256;
// for x below -256 the approximation mirrors ((1 + (-512+y)/256)^256 = (1 - y/256)^256 ~= exp(-y)),
// so exp(-512) ~= exp(0), exp(-511) ~= exp(-1), and exp(-256) = 0
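// plain-double reference of this approximation (illustrative only, not part of the test):
//   double approx_exp(double x) {
//       double t = 1.0 + x / 256.0;
//       for (int i = 0; i < 8; ++i) t *= t; // (1 + x/256)^256 via repeated squaring
//       return t;
//   }
// approx_exp(-512) == 1.0, approx_exp(-511) ~= 0.3676, approx_exp(-256) == 0.0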
std::vector<double> in0_val = {-512, -511, -256};
std::vector<double> res_val = {1, 0.367879, 0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
......@@ -2486,19 +2614,19 @@ TEST_F(FixedTensorTest, softmax) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_softmax_fixed(0, in, out0.get());
test_fixedt_exp_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_softmax_fixed(1, in, out1.get());
test_fixedt_exp_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_softmax_fixed(2, in, out2.get());
test_fixedt_exp_fixed(2, in, out2.get());
});
});
......@@ -2507,17 +2635,16 @@ TEST_F(FixedTensorTest, softmax) {
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.01, true));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.01, true));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.01, true));
}
TEST_F(FixedTensorTest, sigmoid) {
std::vector<size_t> shape = {2, 2};
std::vector<float> in0_val = {0.0, 0.0, -0.5, 0.5};
//std::vector<float> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<float> res_val = {0.5, 0.5, 0.3775, 0.6225};
TEST_F(FixedTensorTest, exp_fixed_upper_bound) {
std::vector<size_t> shape = {1};
// inputs larger than 15 may produce wrong results because of fixed-point multiplication error
std::vector<double> in0_val = {15};
std::vector<double> res_val = {3269017.37};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
......@@ -2535,19 +2662,19 @@ TEST_F(FixedTensorTest, sigmoid) {
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_sigmoid_fixed(0, in, out0.get());
test_fixedt_exp_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_sigmoid_fixed(1, in, out1.get());
test_fixedt_exp_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_sigmoid_fixed(2, in, out2.get());
test_fixedt_exp_fixed(2, in, out2.get());
});
});
......@@ -2556,9 +2683,758 @@ TEST_F(FixedTensorTest, sigmoid) {
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.1));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.1));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.1));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.4, true));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.4, true));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.4, true));
}
TEST_F(FixedTensorTest, polynomial) {
// y = 1 + x
std::vector<size_t> shape = {2, 2};
std::vector<double> in0_val = {-1.0, 2.0, 2.0, 2.0};
//std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {0.0, 3.0, 3.0, 3.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_poly_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_poly_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_poly_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, polynomial_wise) {
// y = x + 1 (x >= 0)
// y = 1 (x < 0)
std::vector<size_t> shape = {2, 2};
std::vector<double> in0_val = {-1.0, 1.0, 2.0, 2.0};
//std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {1.0, 2.0, 3.0, 3.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_poly_wise_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_poly_wise_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_poly_wise_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, relu) {
std::vector<size_t> shape = {2, 2};
std::vector<double> in0_val = {1.0, -1.0, -2, 2};
//std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {1.0, 0.0, 0.0, 2};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_relu_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_relu_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_relu_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, relu_low_bound) {
std::vector<size_t> shape = {1};
std::vector<double> in0_val = {-0x1p-20};
std::vector<double> res_val = {0.0};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_relu_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_relu_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_relu_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, relu_upper_bound) {
std::vector<size_t> shape = {1};
std::vector<double> in0_val = {0x1p20};
std::vector<double> res_val = {0x1p20};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_relu_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_relu_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_relu_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, relu2) {
std::vector<size_t> shape = {2, 2};
std::vector<double> in0_val = {1.0, -1.0, -2, 2};
//std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {1.0, 0.0, 0.0, 2};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_relu2_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_relu2_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_relu2_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get()));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result));
}
TEST_F(FixedTensorTest, softmax) {
std::vector<size_t> shape = {2, 2};
std::vector<double> in0_val = {1.0, 1.0, 1, 1};
//std::vector<double> in1_val = {2.0, 2.0, 2.0, 2.0};
std::vector<double> res_val = {0.5, 0.5, 0.5, 0.5};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_softmax_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_softmax_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_softmax_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.1));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.1));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.1));
}
TEST_F(FixedTensorTest, sigmoid_chebyshev) {
std::vector<size_t> shape = {2, 2};
// approximation error grows when input < -3 or > 4
std::vector<double> in0_val = {1.0, 2.0, -3.0, 4.0};
std::vector<double> res_val = {0.73105, 0.88079, 0.0474, 0.9820};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_sigmoid_chebyshev_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_sigmoid_chebyshev_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_sigmoid_chebyshev_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.03));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.03));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.03));
}
TEST_F(FixedTensorTest, sigmoid) {
std::vector<size_t> shape = {2, 2};
std::vector<double> in0_val = {0.0, 3, 7, 0.5};
std::vector<double> res_val = {0.5, 0.9525, 0.999, 0.6225};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_sigmoid_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_sigmoid_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_sigmoid_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.08));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.08));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.08));
}
TEST_F(FixedTensorTest, sigmoid_enhanced) {
std::vector<size_t> shape = {2, 2};
std::vector<double> in0_val = {0.0, 3, 7, 0.5};
std::vector<double> res_val = {0.5, 0.9525, 0.999, 0.6225};
std::vector<std::shared_ptr<TensorAdapter<int64_t>>> in = {gen(shape)};
test_fixedt_gen_paddle_tensor<int64_t, 16>(in0_val,
shape, _cpu_ctx).copy(in[0].get());
//scaling factor is not copied by the copy function
dynamic_cast<PaddleTensor<int64_t>*>(in[0].get())->
scaling_factor() = 16;
auto out0 = _s_tensor_factory->create<int64_t>(shape);
auto out1 = _s_tensor_factory->create<int64_t>(shape);
auto out2 = _s_tensor_factory->create<int64_t>(shape);
PaddleTensor<int64_t> result =
test_fixedt_gen_paddle_tensor<int64_t, 16>(res_val, shape, _cpu_ctx);
_t[0] = std::thread([this, in, out0]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[0], [&](){
test_fixedt_sigmoid_enhanced_fixed(0, in, out0.get());
});
});
_t[1] = std::thread([this, in, out1]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[1], [&](){
test_fixedt_sigmoid_enhanced_fixed(1, in, out1.get());
});
});
_t[2] = std::thread([this, in, out2]() mutable {
g_ctx_holder::template run_with_context(_exec_ctx.get(), _mpc_ctx[2], [&](){
test_fixedt_sigmoid_enhanced_fixed(2, in, out2.get());
});
});
_t[0].join();
_t[1].join();
_t[2].join();
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), out1.get(), 0.08));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out1.get(), out2.get(), 0.08));
EXPECT_TRUE(test_fixedt_check_tensor_eq(out0.get(), &result, 0.08));
}
TEST_F(FixedTensorTest, max_test) {
std::vector<size_t> shape = { 1 };
std::shared_ptr<TensorAdapter<int64_t>> sl[3] = { gen(shape), gen(shape), gen(shape) };
std::shared_ptr<TensorAdapter<int64_t>> sr[3] = { gen(shape), gen(shape), gen(shape) };
std::shared_ptr<TensorAdapter<int64_t>> sout[6] = { gen(shape), gen(shape), gen(shape),
gen(shape), gen(shape), gen(shape)};
std::shared_ptr<TensorAdapter<int64_t>> sbout[6] = {
gen(shape), gen(shape), gen(shape), gen(shape), gen(shape), gen(shape)};
// lhs = 6 = 1 + 2 + 3
sl[0]->data()[0] = 1;
sl[1]->data()[0] = 2;
sl[2]->data()[0] = 3;
// rhs = 15 = 4 + 5 + 6
sr[0]->data()[0] = 4;
sr[1]->data()[0] = 5;
sr[2]->data()[0] = 6;
Fix64N16 fl0(sl[0].get(), sl[1].get());
Fix64N16 fl1(sl[1].get(), sl[2].get());
Fix64N16 fl2(sl[2].get(), sl[0].get());
Fix64N16 fr0(sr[0].get(), sr[1].get());
Fix64N16 fr1(sr[1].get(), sr[2].get());
Fix64N16 fr2(sr[2].get(), sr[0].get());
Fix64N16 fout0(sout[0].get(), sout[1].get());
Fix64N16 fout1(sout[2].get(), sout[3].get());
Fix64N16 fout2(sout[4].get(), sout[5].get());
BooleanTensor<int64_t> bout0(sbout[0].get(), sbout[1].get());
BooleanTensor<int64_t> bout1(sbout[2].get(), sbout[3].get());
BooleanTensor<int64_t> bout2(sbout[4].get(), sbout[5].get());
auto p = gen(shape);
auto pb = gen(shape);
_t[0] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[0], [&](){
fl0.max(&fr0, &fout0, &bout0);
fout0.reveal_to_one(0, p.get());
bout0.reveal_to_one(0, pb.get());
});
}
);
_t[1] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[1], [&](){
fl1.max(&fr1, &fout1, &bout1);
fout1.reveal_to_one(0, nullptr);
bout1.reveal_to_one(0, nullptr);
});
}
);
_t[2] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[2], [&](){
fl2.max(&fr2, &fout2, &bout2);
fout2.reveal_to_one(0, nullptr);
bout2.reveal_to_one(0, nullptr);
});
}
);
for (auto &t: _t) {
t.join();
}
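// lhs = 6, rhs = 15: the revealed max is 15 and the revealed comparison bit is 1,
// i.e. rhs > lhs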
EXPECT_EQ(std::max(6, 15), p->data()[0]);
EXPECT_EQ(1, pb->data()[0]);
}
TEST_F(FixedTensorTest, max_test2) {
std::vector<size_t> shape = { 1 };
std::shared_ptr<TensorAdapter<int64_t>> sl[3] = { gen(shape), gen(shape), gen(shape) };
std::shared_ptr<TensorAdapter<int64_t>> sout[6] = { gen(shape), gen(shape), gen(shape),
gen(shape), gen(shape), gen(shape)};
// lhs = 6 = 1 + 2 + 3
sl[0]->data()[0] = 1 << 16;
sl[1]->data()[0] = 2 << 16;
sl[2]->data()[0] = 3 << 16;
auto pr = gen(shape);
// rhs = 15
pr->data()[0] = 15 << 16;
pr->scaling_factor() = 16;
Fix64N16 fl0(sl[0].get(), sl[1].get());
Fix64N16 fl1(sl[1].get(), sl[2].get());
Fix64N16 fl2(sl[2].get(), sl[0].get());
Fix64N16 fout0(sout[0].get(), sout[1].get());
Fix64N16 fout1(sout[2].get(), sout[3].get());
Fix64N16 fout2(sout[4].get(), sout[5].get());
auto p = gen(shape);
_t[0] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[0], [&](){
fl0.max(pr.get(), &fout0);
fout0.reveal_to_one(0, p.get());
});
}
);
_t[1] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[1], [&](){
fl1.max(pr.get(), &fout1);
fout1.reveal_to_one(0, nullptr);
});
}
);
_t[2] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[2], [&](){
fl2.max(pr.get(), &fout2);
fout2.reveal_to_one(0, nullptr);
});
}
);
for (auto &t: _t) {
t.join();
}
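// lhs shares sum to 6 << 16 (6.0 at scale 2^16) and the public rhs is 15;
// the revealed raw result is shifted back down by 16 bits before comparison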
EXPECT_EQ(std::max(6, 15), p->data()[0] >> 16);
}
TEST_F(FixedTensorTest, max_pooling_test) {
std::vector<size_t> shape = { 4, 1 };
std::vector<size_t> shape_ = { 1, 1 };
std::shared_ptr<TensorAdapter<int64_t>> sl[3] = { gen(shape), gen(shape), gen(shape) };
std::shared_ptr<TensorAdapter<int64_t>> sfout[6] = {
gen(shape_), gen(shape_), gen(shape_), gen(shape_), gen(shape_), gen(shape_)};
std::shared_ptr<TensorAdapter<int64_t>> sbout[6] = {
gen(shape), gen(shape), gen(shape), gen(shape), gen(shape), gen(shape)};
assign_to_tensor(sl[1].get(), 0l);
assign_to_tensor(sl[2].get(), 0l);
sl[0]->data()[0] = 2;
sl[0]->data()[1] = 1;
sl[0]->data()[2] = 4;
sl[0]->data()[3] = 3;
// input [2 1 4 3]
auto pmax = gen(shape_);
auto ppos = gen(shape);
Fix64N16 fl0(sl[0].get(), sl[1].get());
Fix64N16 fl1(sl[1].get(), sl[2].get());
Fix64N16 fl2(sl[2].get(), sl[0].get());
Fix64N16 fout0(sfout[0].get(), sfout[1].get());
Fix64N16 fout1(sfout[2].get(), sfout[3].get());
Fix64N16 fout2(sfout[4].get(), sfout[5].get());
BooleanTensor<int64_t> bout0(sbout[0].get(), sbout[1].get());
BooleanTensor<int64_t> bout1(sbout[2].get(), sbout[3].get());
BooleanTensor<int64_t> bout2(sbout[4].get(), sbout[5].get());
_t[0] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[0], [&](){
fl0.max_pooling(&fout0, &bout0);
fout0.reveal_to_one(0, pmax.get());
bout0.reveal_to_one(0, ppos.get());
});
}
);
_t[1] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[1], [&](){
fl1.max_pooling(&fout1, &bout1);
fout1.reveal_to_one(0, nullptr);
bout1.reveal_to_one(0, nullptr);
});
}
);
_t[2] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[2], [&](){
fl2.max_pooling(&fout2, &bout2);
fout2.reveal_to_one(0, nullptr);
bout2.reveal_to_one(0, nullptr);
});
}
);
for (auto &t: _t) {
t.join();
}
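// the plaintext input is [2 1 4 3]; the revealed max is 4 and ppos is a
// one-hot indicator marking index 2 as the argmax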
EXPECT_EQ(4, pmax->data()[0]);
EXPECT_EQ(0, ppos->data()[0]);
EXPECT_EQ(0, ppos->data()[1]);
EXPECT_EQ(1, ppos->data()[2]);
EXPECT_EQ(0, ppos->data()[3]);
}
TEST_F(FixedTensorTest, inv_sqrt_test) {
std::vector<size_t> shape = { 1 };
std::shared_ptr<TensorAdapter<int64_t>> sl[3] = { gen(shape), gen(shape), gen(shape) };
std::shared_ptr<TensorAdapter<int64_t>> sfout[6] = {
gen(shape), gen(shape), gen(shape), gen(shape), gen(shape), gen(shape)};
sl[0]->data()[0] = 0x4p16;
sl[1]->data()[0] = 0;
sl[2]->data()[0] = 0;
// input [4]
auto p = gen(shape);
Fix64N16 fl0(sl[0].get(), sl[1].get());
Fix64N16 fl1(sl[1].get(), sl[2].get());
Fix64N16 fl2(sl[2].get(), sl[0].get());
Fix64N16 fout0(sfout[0].get(), sfout[1].get());
Fix64N16 fout1(sfout[2].get(), sfout[3].get());
Fix64N16 fout2(sfout[4].get(), sfout[5].get());
_t[0] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[0], [&](){
fl0.inverse_square_root(&fout0);
fout0.reveal_to_one(0, p.get());
});
}
);
_t[1] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[1], [&](){
fl1.inverse_square_root(&fout1);
fout1.reveal_to_one(0, nullptr);
});
}
);
_t[2] = std::thread(
[&] () {
g_ctx_holder::template run_with_context(
_exec_ctx.get(), _mpc_ctx[2], [&](){
fl2.inverse_square_root(&fout2);
fout2.reveal_to_one(0, nullptr);
});
}
);
for (auto &t: _t) {
t.join();
}
// inv_sqrt(4) = 1/2
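// the revealed raw value is rescaled by 2^-16 and compared with a tolerance of
// two fixed-point units (2 / 2^16)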
EXPECT_NEAR(0.5, p->data()[0] / 0x1p16f, 2 / 0x1p16f);
}
} // namespace aby3
......@@ -20,29 +20,19 @@
#include "prng_utils.h"
namespace aby3 {
template <typename T, size_t N> class FixedPointUtil {
template<typename T, size_t N>
class FixedPointUtil {
public:
static double reveal(T *shares[3]) {
// reveal
static double reveal(T* shares[3]) {
//reveal
T sum = *shares[0] + *shares[1] + *shares[2];
// to double
int neg = sum < 0 ? -1 : 1;
sum = sum * neg;
T high = sum >> N;
T low = sum & (((T)1 << N) - 1);
double ret = high + low / pow(2, N);
return neg * ret;
return sum / pow(2, N);
}
static void share(double input, T *ret[3]) {
// to int
int neg = input < 0 ? -1 : 1;
double val = input * neg;
T high = val;
double low = val - high;
T ll_in = ((T)high << N) + (T)(low * pow(2, N));
ll_in *= neg;
// share
static void share(double input, T* ret[3]) {
T ll_in = (T) (input * pow(2, N));
//share
*ret[0] = _s_prng.get<T>();
*ret[1] = _s_prng.get<T>();
*ret[2] = ll_in - *ret[0] - *ret[1];
......@@ -51,8 +41,7 @@ public:
static PseudorandomNumberGenerator _s_prng;
};
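// usage sketch (illustrative only): round-trip a double through three additive shares
//   int64_t s0, s1, s2;
//   int64_t* shares[3] = {&s0, &s1, &s2};
//   FixedPointUtil<int64_t, 16>::share(1.5, shares);
//   double x = FixedPointUtil<int64_t, 16>::reveal(shares); // x == 1.5 up to 2^-16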
template <typename T, size_t N>
PseudorandomNumberGenerator
FixedPointUtil<T, N>::_s_prng(block_from_dev_urandom());
template<typename T, size_t N>
PseudorandomNumberGenerator FixedPointUtil<T, N>::_s_prng(block_from_dev_urandom());
} // namespace aby3
} //namespace aby3
......@@ -14,7 +14,7 @@
#include "fixedpoint_util.h"
#include "gtest/gtest.h"
// test
namespace aby3 {
TEST(FixedPointUtil, int64_test) {
......
......@@ -19,76 +19,81 @@
namespace aby3 {
template <typename T> class TensorAdapter {
template <typename T>
class TensorAdapter {
public:
TensorAdapter() = default;
virtual ~TensorAdapter() = default;
virtual T *data() = 0;
virtual T* data() = 0;
virtual const T *data() const = 0;
virtual const T* data() const = 0;
virtual std::vector<size_t> shape() const = 0;
virtual void reshape(const std::vector<size_t> &shape) = 0;
virtual void reshape(const std::vector<size_t>& shape) = 0;
virtual size_t numel() const = 0;
virtual void copy(TensorAdapter *ret) const {
virtual void copy(TensorAdapter* ret) const {
// TODO: check that the shapes are equal
std::copy(data(), data() + numel(), ret->data());
}
// element-wise op; operands must have the same dimensions
virtual void add(const TensorAdapter *rhs, TensorAdapter *ret) const = 0;
virtual void add(const TensorAdapter* rhs, TensorAdapter* ret) const = 0;
// element-wise op; operands must have the same dimensions
virtual void sub(const TensorAdapter *rhs, TensorAdapter *ret) const = 0;
virtual void sub(const TensorAdapter* rhs, TensorAdapter* ret) const = 0;
virtual void negative(TensorAdapter *ret) const = 0;
virtual void negative(TensorAdapter* ret) const = 0;
// element-wise op; operands must have the same dimensions
virtual void mul(const TensorAdapter *rhs, TensorAdapter *ret) const = 0;
virtual void mul(const TensorAdapter* rhs, TensorAdapter* ret) const = 0;
// element-wise op; operands must have the same dimensions
virtual void div(const TensorAdapter *rhs, TensorAdapter *ret) const = 0;
virtual void div(const TensorAdapter* rhs, TensorAdapter* ret) const = 0;
// 2-D matrix multiply; both operands must have rank 2
virtual void mat_mul(const TensorAdapter *rhs, TensorAdapter *ret) const = 0;
virtual void mat_mul(const TensorAdapter* rhs, TensorAdapter* ret) const = 0;
// element-wise op; operands must have the same dimensions
virtual void bitwise_xor(const TensorAdapter *rhs,
TensorAdapter *ret) const = 0;
virtual void bitwise_xor(const TensorAdapter* rhs, TensorAdapter* ret) const = 0;
// element-wise op; operands must have the same dimensions
virtual void bitwise_and(const TensorAdapter *rhs,
TensorAdapter *ret) const = 0;
virtual void bitwise_and(const TensorAdapter* rhs, TensorAdapter* ret) const = 0;
// element-wise op; operands must have the same dimensions
virtual void bitwise_or(const TensorAdapter *rhs,
TensorAdapter *ret) const = 0;
virtual void bitwise_or(const TensorAdapter* rhs, TensorAdapter* ret) const = 0;
// element-wise op; operands must have the same dimensions
virtual void bitwise_not(TensorAdapter *ret) const = 0;
virtual void bitwise_not(TensorAdapter* ret) const = 0;
virtual void lshift(size_t rhs, TensorAdapter *ret) const = 0;
virtual void lshift(size_t rhs, TensorAdapter* ret) const = 0;
virtual void rshift(size_t rhs, TensorAdapter *ret) const = 0;
virtual void rshift(size_t rhs, TensorAdapter* ret) const = 0;
virtual void logical_rshift(size_t rhs, TensorAdapter *ret) const = 0;
virtual void logical_rshift(size_t rhs, TensorAdapter* ret) const = 0;
// when using an integer type T as fixed-point number
// value of T val is interpreted as val / 2 ^ scaling_factor()
virtual size_t scaling_factor() const = 0;
virtual size_t &scaling_factor() = 0;
virtual size_t& scaling_factor() = 0;
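// e.g. with scaling_factor() == 16, the raw int64_t value (1 << 16)
// represents 1.0 and (3 << 15) represents 1.5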
// slice by shape[0]
// e.g. x.shape = [ 2, 3, 4]
// x.slice(1, 2, y)
// y.shape = [ 1, 3, 4]
virtual void slice(size_t begin_idx, size_t end_idx,
TensorAdapter *out) const = 0;
virtual void slice(size_t begin_idx, size_t end_idx, TensorAdapter* out) const = 0;
};
template<typename T>
inline void assign_to_tensor(TensorAdapter<T>* input, T assign_num) {
std::transform(input->data(), input->data() + input->numel(),
input->data(), [assign_num](T) { return assign_num; });
}
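// e.g. assign_to_tensor(t.get(), (int64_t)0) zero-fills a tensor, as the
// fixed-point tests above do to zero two of the three shares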
} // namespace aby3
add_compile_options(-msse4.2 -maes)
set(PSI_SRCS
"./aes.cc"
"./cuckoo_hash.cc"
......@@ -16,7 +14,11 @@ add_dependencies(psi_o crypto)
add_library(psi SHARED $<TARGET_OBJECTS:psi_o>)
target_link_libraries(psi crypto)
if (USE_OPENMP)
target_link_libraries(psi OpenMP::OpenMP_CXX OpenMP::OpenMP_C crypto)
else()
target_link_libraries(psi crypto)
endif (USE_OPENMP)
cc_test(aes_test SRCS aes_test.cc DEPS psi)
cc_test(ot_test SRCS ot_test.cc DEPS psi)
......
......@@ -14,10 +14,13 @@
#include "aes.h"
#ifdef USE_AES_NI
#include <wmmintrin.h>
#endif
namespace psi {
#ifdef USE_AES_NI
static block aes128_key_expansion(block key, block key_rcon) {
key_rcon = _mm_shuffle_epi32(key_rcon, _MM_SHUFFLE(3, 3, 3, 3));
key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
......@@ -26,9 +29,7 @@ static block aes128_key_expansion(block key, block key_rcon) {
return _mm_xor_si128(key, key_rcon);
}
AES::AES(const block &user_key) { set_key(user_key); }
void AES::set_key(const block &user_key) {
void AES::set_key(const block& user_key) {
_round_key[0] = user_key;
_round_key[1] = aes128_key_expansion(
_round_key[0], _mm_aeskeygenassist_si128(_round_key[0], 0x01));
......@@ -52,7 +53,7 @@ void AES::set_key(const block &user_key) {
_round_key[9], _mm_aeskeygenassist_si128(_round_key[9], 0x36));
}
void AES::ecb_enc_block(const block &plaintext, block &cyphertext) const {
void AES::ecb_enc_block(const block& plaintext, block& cyphertext) const {
cyphertext = _mm_xor_si128(plaintext, _round_key[0]);
cyphertext = _mm_aesenc_si128(cyphertext, _round_key[1]);
cyphertext = _mm_aesenc_si128(cyphertext, _round_key[2]);
......@@ -66,57 +67,37 @@ void AES::ecb_enc_block(const block &plaintext, block &cyphertext) const {
cyphertext = _mm_aesenclast_si128(cyphertext, _round_key[10]);
}
block AES::ecb_enc_block(const block &plaintext) const {
block ret;
ecb_enc_block(plaintext, ret);
return ret;
#else
// openssl aes
void AES::set_key(const block& user_key) {
// sizeof block = 128 bit
AES_set_encrypt_key(reinterpret_cast<const unsigned char*>(&user_key),
128, &_aes_key);
}
#define REPEATED_FUNC(func, idx, out, in, k) \
do { \
out[idx + 0] = func(in[idx + 0], k); \
out[idx + 1] = func(in[idx + 1], k); \
out[idx + 2] = func(in[idx + 2], k); \
out[idx + 3] = func(in[idx + 3], k); \
out[idx + 4] = func(in[idx + 4], k); \
out[idx + 5] = func(in[idx + 5], k); \
out[idx + 6] = func(in[idx + 6], k); \
out[idx + 7] = func(in[idx + 7], k); \
} while (0)
void AES::ecb_enc_block(const block& plaintext, block& cyphertext) const {
AES_encrypt(reinterpret_cast<const unsigned char*>(&plaintext),
reinterpret_cast<unsigned char*>(&cyphertext),
&_aes_key);
}
#endif
void AES::ecb_enc_blocks(const block *plaintexts, size_t block_num,
block *cyphertext) const {
const size_t step = 8;
size_t idx = 0;
size_t length = block_num - block_num % step;
void AES::ecb_enc_blocks(const block* plaintexts, size_t block_num,
block* cyphertext) const {
for (; idx < length; idx += step) {
REPEATED_FUNC(_mm_xor_si128, idx, cyphertext, plaintexts, _round_key[0]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[1]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[2]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[3]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[4]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[5]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[6]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[7]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[8]);
REPEATED_FUNC(_mm_aesenc_si128, idx, cyphertext, cyphertext, _round_key[9]);
REPEATED_FUNC(_mm_aesenclast_si128, idx, cyphertext, cyphertext,
_round_key[10]);
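// ecb_enc_blocks now simply dispatches to ecb_enc_block (AES-NI or the OpenSSL
// fallback, depending on USE_AES_NI). When built with -fopenmp (USE_OPENMP) the
// loop below is split across 4 threads; otherwise the pragmas are ignored and
// the loop runs serially.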
#pragma omp parallel num_threads(4)
#pragma omp for
for (size_t i = 0; i < block_num; ++i) {
ecb_enc_block(plaintexts[i], cyphertext[i]);
}
}
for (; idx < block_num; ++idx) {
cyphertext[idx] = _mm_xor_si128(plaintexts[idx], _round_key[0]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[1]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[2]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[3]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[4]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[5]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[6]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[7]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[8]);
cyphertext[idx] = _mm_aesenc_si128(cyphertext[idx], _round_key[9]);
cyphertext[idx] = _mm_aesenclast_si128(cyphertext[idx], _round_key[10]);
}
AES::AES(const block& user_key) { set_key(user_key); }
block AES::ecb_enc_block(const block& plaintext) const {
block ret;
ecb_enc_block(plaintext, ret);
return ret;
}
} // namespace psi
......@@ -16,6 +16,10 @@
#include <emmintrin.h>
#ifndef USE_AES_NI
#include <openssl/aes.h>
#endif
namespace psi {
using block = __m128i;
......@@ -24,23 +28,27 @@ class AES {
public:
AES() {}
AES(const block &user_key);
AES(const AES &other) = delete;
AES(const block& user_key);
AES &operator=(const AES &other) = delete;
AES(const AES& other) = delete;
void set_key(const block &user_key);
AES& operator=(const AES& other) = delete;
void ecb_enc_block(const block &plaintext, block &cyphertext) const;
void set_key(const block& user_key);
block ecb_enc_block(const block &plaintext) const;
void ecb_enc_block(const block& plaintext, block& cyphertext) const;
void ecb_enc_blocks(const block *plaintexts, size_t block_num,
block *ciphertext) const;
block ecb_enc_block(const block& plaintext) const;
void ecb_enc_blocks(const block* plaintexts, size_t block_num,
block* ciphertext) const;
private:
#ifdef USE_AES_NI
block _round_key[11];
#else
AES_KEY _aes_key;
#endif
};
} // namespace psi
......@@ -14,7 +14,9 @@
#include "aes.h"
#include <chrono>
#include <cstring>
#include <iostream>
#include "gtest/gtest.h"
......@@ -24,16 +26,13 @@ namespace psi {
TEST(aes, base_test) {
std::string plain("\x00\x11\x22\x33\x44\x55\x66\x77"
"\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
16);
"\x88\x99\xaa\xbb\xcc\xdd\xee\xff", 16);
std::string key("\x00\x01\x02\x03\x04\x05\x06\x07"
"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
16);
"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16);
std::string cipher("\x69\xc4\xe0\xd8\x6a\x7b\x04\x30"
"\xd8\xcd\xb7\x80\x70\xb4\xc5\x5a",
16);
"\xd8\xcd\xb7\x80\x70\xb4\xc5\x5a", 16);
block p;
......@@ -58,4 +57,31 @@ TEST(aes, base_test) {
EXPECT_TRUE(equals(c, c_));
}
const size_t bench_size = 0x10000;
block p[bench_size];
block c[bench_size];
TEST(aes, bench) {
std::string key("\x00\x01\x02\x03\x04\x05\x06\x07"
"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", 16);
block k;
std::memcpy(&k, key.data(), key.size());
AES aes(k);
const size_t rep = 0x100;
auto t0 = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < rep; ++i) {
aes.ecb_enc_blocks(p, bench_size, c);
}
auto t1 = std::chrono::high_resolution_clock::now();
auto d = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0);
std::cerr << d.count() * 1.0 / (rep * bench_size) << " ns per op\n";
}
} // namespace psi
......@@ -138,6 +138,9 @@ public:
if (ret < 0) {
throw std::runtime_error("socket error: recv, errno: " +
std::to_string(errno));
} else if (ret == 0) {
throw std::runtime_error("socket error: 0 byte recved, "
"socket shutdown by peer");
}
recved += ret;
}
......
......@@ -23,38 +23,47 @@ namespace psi {
class PseudorandomNumberGenerator {
public:
PseudorandomNumberGenerator() = default;
PseudorandomNumberGenerator(const block &seed);
PseudorandomNumberGenerator(const PseudorandomNumberGenerator &other) =
delete;
PseudorandomNumberGenerator(
const PseudorandomNumberGenerator &other) = delete;
PseudorandomNumberGenerator &
operator=(const PseudorandomNumberGenerator &other) = delete;
PseudorandomNumberGenerator &operator=(
const PseudorandomNumberGenerator &other) = delete;
void set_seed(const block &b);
template <typename T> T get() {
template <typename T>
T get() {
T data;
get_array(&data, sizeof(T));
return data;
}
void get_array(void *res, size_t len);
void get_array(void* res, size_t len);
// for std::shuffle
typedef uint64_t result_type;
constexpr static uint64_t min() { return 0; }
constexpr static uint64_t min() {
return 0;
}
constexpr static uint64_t max() { return -1ull; }
constexpr static uint64_t max() {
return -1ull;
}
uint64_t operator()() { return get<uint64_t>(); }
uint64_t operator()() {
return get<uint64_t>();
}
private:
// buffer num for aes cipher
static const size_t _s_buffer_size = 0x100;
static const size_t _s_buffer_size = 0x100000;
static const size_t _s_byte_capacity = _s_buffer_size * sizeof(block);
......@@ -71,3 +80,4 @@ private:
void refill_buffer();
};
} // namespace psi
......@@ -298,24 +298,6 @@ const size_t PsiApi::_s_recv_step_len = 0x1000000;
// default sync sock, no timeout
int PsiApi::_s_timeout_s = 0;
int get_err_code(const char *err) {
std::string s(err);
if (s.find("socket error: recv timeout") != std::string::npos) {
return SOCKET_TIMEOUT;
} else if (s.find("socket error") != std::string::npos) {
return SOCKET_ERROR;
} else if (s.find("openssl error") != std::string::npos) {
return OPENSSL_ERROR;
} else if (s.find("np ot error") != std::string::npos) {
return INTERNAL_ERROR;
} else if (s.find("ot ext error") != std::string::npos) {
return INTERNAL_ERROR;
} else if (s.find("psi error") != std::string::npos) {
return INTERNAL_ERROR;
}
return UNKNOWN_ERROR;
}
int psi_send(int port, const std::set<std::string> &in,
std::atomic<int> *psi_progress) {
try {
......@@ -331,14 +313,9 @@ int psi_send(int port, const std::set<std::string> &in,
if (psi_progress) {
*psi_progress = -1;
}
auto err = get_err_code(e.what());
if (err != UNKNOWN_ERROR) {
return err;
} else {
throw;
}
}
return PSI_OK;
return 0;
}
int psi_recv(const std::string &remote_ip, int port,
......@@ -357,14 +334,9 @@ int psi_recv(const std::string &remote_ip, int port,
if (psi_progress) {
*psi_progress = -1;
}
auto err = get_err_code(e.what());
if (err != UNKNOWN_ERROR) {
return err;
} else {
throw;
}
}
return PSI_OK;
return 0;
}
void set_psi_timeout(int timeout_s) { PsiApi::set_psi_timeout(timeout_s); }
......
......@@ -21,15 +21,6 @@
namespace psi {
enum PsiReturnCode {
PSI_OK = 0,
INTERNAL_ERROR = -1,
OPENSSL_ERROR = -2,
SOCKET_ERROR = -3,
SOCKET_TIMEOUT = -4,
UNKNOWN_ERROR = -5
};
int psi_send(int port, const std::set<std::string> &in,
std::atomic<int> *psi_progress = nullptr);
......
......@@ -27,7 +27,6 @@ public:
int _port;
static const int _s_test_size = 1e3;
public:
PsiAPITest() {
for (int i = 0; i < _s_test_size; ++i) {
......@@ -42,8 +41,18 @@ public:
TEST_F(PsiAPITest, full_test) {
auto test_send = [this]() {
// find valid port
for (int ret = SOCKET_ERROR; ret == SOCKET_ERROR; ++_port) {
ret = psi_send(_port, _input, nullptr);
for (;; ++_port) {
try {
psi_send(_port, _input, nullptr);
break;
} catch (const std::exception& e){
std::string s(e.what());
if (s.find("socket error") != std::string::npos) {
continue;
} else {
throw;
}
}
}
};
auto t_send = std::thread(test_send);
......@@ -56,7 +65,7 @@ TEST_F(PsiAPITest, full_test) {
t_send.join();
std::set<std::string> out_set;
for (auto &x : output) {
for (auto& x: output) {
out_set.emplace(x);
}
ASSERT_EQ(out_set, _input);
......
......@@ -16,15 +16,13 @@
#include <array>
#include <wmmintrin.h>
namespace psi {
void sse_load_sub_square(std::array<block, 2> &out, std::array<block, 128> &in,
void sse_load_sub_square(std::array<block, 2>& out, std::array<block, 128>& in,
size_t x, size_t y) {
std::array<std::array<uint8_t, 16>, 2> &out_byte_view =
std::array<std::array<uint8_t, 16>, 2>& out_byte_view =
*reinterpret_cast<std::array<std::array<uint8_t, 16>, 2> *>(&out);
std::array<std::array<uint8_t, 16>, 128> &in_byte_view =
std::array<std::array<uint8_t, 16>, 128>& in_byte_view =
*reinterpret_cast<std::array<std::array<uint8_t, 16>, 128> *>(&in);
for (size_t l = 0; l < 16; l++) {
......@@ -33,9 +31,9 @@ void sse_load_sub_square(std::array<block, 2> &out, std::array<block, 128> &in,
}
}
void sse_transpose_sub_square(std::array<block, 128> &out,
std::array<block, 2> &in, size_t x, size_t y) {
std::array<std::array<uint16_t, 8>, 128> &out_u16_view =
void sse_transpose_sub_square(std::array<block, 128>& out,
std::array<block, 2>& in, size_t x, size_t y) {
std::array<std::array<uint16_t, 8>, 128>& out_u16_view =
*reinterpret_cast<std::array<std::array<uint16_t, 8>, 128> *>(&out);
for (size_t j = 0; j < 8; j++) {
......@@ -47,7 +45,7 @@ void sse_transpose_sub_square(std::array<block, 128> &out,
}
}
void sse_transpose128(std::array<block, 128> &in_out) {
void sse_transpose128(std::array<block, 128>& in_out) {
std::array<block, 2> a, b;
for (size_t j = 0; j < 8; j++) {
......
docs/source/_static/FL-framework.png (binary image changed: 84.5 KB → 84.1 KB)
......@@ -213,4 +213,4 @@ while not trainer.stop():
To show the effectiveness of DPSGD-based federated learning with PaddleFL, a simulated experiment is conducted on an open source dataset MNIST. From the figure given below, model evaluation results are similar between DPSGD-based federated learning and traditional parameter server training when the overall privacy budget *epsilon* is 1.3 or 0.13.
<img src="fl_dpsgd_benchmark.png" height=400 width=600 hspace='10'/> <br />
<img src="_static/fl_dpsgd_benchmark.png" height=400 width=600 hspace='10'/> <br />
......@@ -109,4 +109,4 @@ wget https://paddle-zwh.bj.bcebos.com/gru4rec_paddlefl_benchmark/gru4rec_benchma
| 1/4 of the whole dataset | private training | - | 0.269 |
| 1/4 of the whole dataset | private training | - | 0.282 |
<img src="fl_benchmark.png" height=300 width=500 hspace='10'/> <br />
<img src="_static/fl_benchmark.png" height=300 width=500 hspace='10'/> <br />
## Instructions for PaddleFL-MPC UCI Housing Demo
([Simplified Chinese](./README_CN.md)|English)
This document introduces how to run the UCI Housing demo based on Paddle-MPC, which can be run in two ways, i.e., on a single machine or on multiple machines.
### 1. Running on Single Machine
......
......@@ -8,7 +8,8 @@ Data is becoming more and more expensive nowadays, and sharing of raw data is ve
## Overview of PaddleFL
<img src='../../../images/FL-framework.png' width = "1000" height = "320" align="middle"/>
<img src='_static/FL-framework.png' width = "1000" height = "320" align="middle"/>
In PaddleFL, horizontal and vertical federated learning strategies will be implemented according to the categorization given in [4]. Application demonstrations in natural language processing, computer vision and recommendation will be provided in PaddleFL.
......@@ -36,7 +37,7 @@ Besides, PFM is implemented based on secure multi-party computation (MPC) to ena
### Data Parallel
<img src='images/FL-training.png' width = "1000" height = "400" align="middle"/>
<img src='_static/FL-training.png' width = "1000" height = "400" align="middle"/>
In Data Parallel, components for defining a federated learning task and training a federated learning job are as follows:
......@@ -60,7 +61,7 @@ In Data Parallel, components for defining a federated learning task and training
### Federated Learning with MPC
<img src='../../../images/PFM-overview.png' width = "1000" height = "446" align="middle"/>
<img src='_static/PFM-overview.png' width = "1000" height = "446" align="middle"/>
Paddle FL MPC implements secure training and inference tasks based on an underlying MPC protocol such as ABY3[11], which is a highly efficient three-party computing model.
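As a minimal illustrative sketch (the role id, Redis address and port below are placeholders; the calls mirror the demos elsewhere in this change), a PFM program first initializes the ABY3 protocol and then declares encrypted inputs holding `int64` shares:

```python
import paddle_fl.mpc as pfl_mpc

# placeholder values for illustration; each of the three parties runs with its own role id (0, 1 or 2)
role, redis_server, redis_port = 0, "127.0.0.1", 9937
pfl_mpc.init("aby3", role, "localhost", redis_server, redis_port)

# encrypted variables are int64 ABY3 shares
x = pfl_mpc.data(name='x', shape=[2, 1], dtype='int64')
```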
......
......@@ -67,5 +67,7 @@ from . import data_utils
from .io import *
from .version import version
from .layers import mpc_math_op_patch
from . import input
from . import initializer
mpc_math_op_patch.monkey_patch_mpc_variable()
......@@ -30,6 +30,7 @@ from paddle.fluid import log_helper
import paddle.fluid
import paddle.fluid.backward as backward
from .framework import is_mpc_parameter
import mpc_data_utils as mdu
_logger = log_helper.get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
......@@ -40,7 +41,7 @@ def _create_loss_op_desc_(loss):
"fill_constant", {},
{"Out": [backward._append_grad_suffix_(loss.name)]}, {
"shape": [2, 1],
"value": 21845,
"value": mdu.mpc_one_share,
"dtype": loss.dtype,
"force_cpu": False,
core.op_proto_and_checker_maker.kOpRoleAttrName():
......
......@@ -21,6 +21,7 @@ import six
import paddle
import paddle.fluid as fluid
import mpc_data_utils as mdu
from ..layers import __all__ as all_ops
__all__ = [
'encrypt',
......@@ -35,12 +36,22 @@ __all__ = [
'decrypt_model',
]
# operators that should be skipped when encrypt and decrypt
op_to_skip = ['feed', 'fetch', 'scale', 'mpc_init']
# operators that are supported currently for model encryption and decryption
supported_mpc_ops = all_ops + ['fill_constant', 'sgd'] + op_to_skip
# variables that used as plain variables and need no encryption
plain_vars = ['learning_rate_0']
SHARE_NUM = 3
ABY3_SHARE_DIM = 2
ABY3_MODEL_NAME = "__model__.aby3"
MODEL_NAME = "__model__"
MODEL_SHARE_DIR = "model_share"
MPC_OP_PREFIX = "mpc_"
# the MPC value of plain value 1, which is used for
# default value of fill_constant OP
MPC_ONE_SHARE = mdu.mpc_one_share
def encrypt(number):
......@@ -252,57 +263,144 @@ def batch(reader, batch_size, drop_last=False):
return reshaped_batch_reader
def encrypt_model(plain_model, mpc_model_dir, model_filename=None):
def transpile(program=None):
"""
Encrypts model, and save to files for mpc inference.
Transpile Paddle program into MPC program.
Args:
plain_model: The directory of paddle model.
mpc_model_dir: The directory that save mpc model shares.
model_filename: The name of model file.
program: The plain Paddle model program, defaults to
default_main_program.
Returns: The MPC program.
"""
if program is None:
program = fluid.default_main_program()
place = fluid.CPUPlace()
exe = fluid.Executor(place)
[main_prog, _, _] = fluid.io.load_inference_model(
dirname=plain_model, executor=exe, model_filename=model_filename)
# TODO(xukun): support more blocks. Tips: may be just adding "for loop" for all blocks.
if main_prog.num_blocks > 1:
if program.num_blocks > 1:
raise NotImplementedError(
"The number of blocks in current main program"
"is {}, which is not supported in this version."
.format(main_prog.num_blocks()))
global_block = main_prog.global_block()
.format(program.num_blocks))
global_block = program.global_block()
g_scope = fluid.global_scope()
for op in global_block.ops:
if op.type != "feed" and op.type != "fetch":
# TODO: needs to check if the mpc op exists
op.desc.set_type(MPC_OP_PREFIX + op.type)
for input_arg_name in op.input_arg_names:
var = global_block.var(input_arg_name)
mpc_vars_names = _transpile_type_and_shape(block=global_block)
# encrypt tensor values for each variable in mpc_var_names
for mpc_var_name in mpc_vars_names:
if g_scope.find_var(mpc_var_name) is not None:
param = g_scope.find_var(mpc_var_name)
param_tensor = np.array(param.get_tensor())
mpc_var = global_block.var(mpc_var_name)
if mpc_var_name not in plain_vars:
param.get_tensor()._set_dims(mpc_var.shape)
# process initialized params that should be 0
set_tensor_value = np.array([param_tensor, param_tensor]).astype(np.int64)
param.get_tensor().set(set_tensor_value, place)
else:
param.get_tensor().set(np.array(param.get_tensor()).astype('float64'), place)
# trigger sync to replace old ops.
op_num = global_block.desc.op_size()
_ = global_block.desc.append_op()
global_block.desc._remove_op(op_num, op_num + 1)
return program
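
# Illustrative usage sketch; the helper below is hypothetical and not part of the public API.
def _transpile_usage_sketch():
    """A minimal sketch of a typical transpile() call: build a plain Paddle
    program first, then transpile it. Afterwards, supported OPs carry the
    "mpc_" prefix and variables hold int64 ABY3 shares shaped [2, ...].
    """
    return transpile(fluid.default_main_program())
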
def _transpile_type_and_shape(block):
"""
Transpile dtype and shape of plain variables into MPC dtype and shape.
And transpile op type into MPC type.
Args:
block: The block in Paddle program.
Returns: A set of variable names to encrypt.
"""
mpc_vars_names = set()
# store variable name in mpc_vars_names, and encrypt dtype and shape
for var_name in block.vars:
var = block.var(var_name)
if var.name != "feed" and var.name != "fetch":
mpc_vars_names.add(var.name)
if var_name in plain_vars:
var.desc.set_dtype(fluid.framework.convert_np_dtype_to_dtype_(np.float64))
continue
# set mpc param shape = [2, old_shape]
encrypted_var_shape = (ABY3_SHARE_DIM, ) + var.shape
encrypted_var_shape = (ABY3_SHARE_DIM,) + var.shape
var.desc.set_dtype(fluid.framework.convert_np_dtype_to_dtype_(np.int64))
var.desc.set_shape(encrypted_var_shape)
if g_scope.find_var(input_arg_name) is not None:
param = g_scope.find_var(input_arg_name)
param_tensor_shares = make_shares(
np.array(param.get_tensor()))
# encrypt op type, or other attrs if needed
for op in block.ops:
if _is_supported_op(op.type):
if op.type == 'fill_constant':
op._set_attr(name='shape', val=(2L, 1L))
# set default MPC value for fill_constant OP
op._set_attr(name='value', val=MPC_ONE_SHARE)
op._set_attr(name='dtype', val=3)
elif op.type in op_to_skip:
pass
else:
op.desc.set_type(MPC_OP_PREFIX + op.type)
else:
raise NotImplementedError('Operator {} is unsupported.'
.format(op.type))
return mpc_vars_names
def encrypt_model(program, mpc_model_dir=None, model_filename=None):
"""
Encrypt model, and save encrypted model (i.e., MPC model shares) into
files for MPC training, updating, or inference.
Args:
program: The loaded program of paddle model.
mpc_model_dir: The directory that save MPC model shares.
model_filename: The name of MPC model file, default is __model__.aby3.
"""
place = fluid.CPUPlace()
exe = fluid.Executor(place)
# TODO(xukun): support more blocks. Tips: may just adding "for loop" for all blocks.
if program.num_blocks > 1:
raise NotImplementedError(
"The number of blocks in current main program"
"is {}, which is not supported in this version."
.format(program.num_blocks))
global_block = program.global_block()
g_scope = fluid.global_scope()
mpc_vars_names = _transpile_type_and_shape(global_block)
# encrypt tensor values for each variable in mpc_var_names
for mpc_var_name in mpc_vars_names:
if g_scope.find_var(mpc_var_name) is not None:
param = g_scope.find_var(mpc_var_name)
param_tensor = np.array(param.get_tensor())
param_tensor_shares = make_shares(param_tensor)
mpc_var = global_block.var(mpc_var_name)
for idx in six.moves.range(SHARE_NUM):
param.get_tensor()._set_dims(encrypted_var_shape)
param.get_tensor().set(
get_aby3_shares(param_tensor_shares, idx), place)
if mpc_var_name not in plain_vars:
param.get_tensor()._set_dims(mpc_var.shape)
set_tensor_value = get_aby3_shares(param_tensor_shares, idx)
param.get_tensor().set(set_tensor_value, place)
else:
param.get_tensor().set(np.array(param.get_tensor()).astype('float64'), place)
param_share_dir = os.path.join(
mpc_model_dir, MODEL_SHARE_DIR + "_" + str(idx))
fluid.io.save_vars(
executor=exe,
dirname=param_share_dir,
vars=[var],
filename=input_arg_name)
# trigger sync to replace old ops
vars=[mpc_var],
filename=mpc_var_name)
# trigger sync to replace old ops.
op_num = global_block.desc.op_size()
_ = global_block.desc.append_op()
global_block.desc._remove_op(op_num, op_num + 1)
......@@ -317,17 +415,19 @@ def encrypt_model(plain_model, mpc_model_dir, model_filename=None):
os.makedirs(model_share_dir)
model_name = os.path.join(model_share_dir, model_basename)
with open(model_name, "wb") as f:
f.write(main_prog.desc.serialize_to_string())
f.write(program.desc.serialize_to_string())
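
# Illustrative usage sketch; the helper name and directory below are hypothetical placeholders.
def _encrypt_model_usage_sketch():
    """A minimal sketch of encrypt_model(): it writes three ABY3 shares of the
    program and its parameters under mpc_model_dir/model_share_0..2.
    """
    encrypt_model(program=fluid.default_main_program(),
                  mpc_model_dir="/tmp/mpc_model")
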
def decrypt_model(mpc_model_dir, plain_model_path, model_filename=None):
def decrypt_model(mpc_model_dir, plain_model_path, mpc_model_filename=None, plain_model_filename=None):
"""
Reveal a paddle model.
Reveal a paddle model. Load encrypted model (i.e., MPC model shares) from files and decrypt it
into paddle model.
Args:
mpc_model_dir: The directory of all model shares.
plain_model_path: The directory to save revealed paddle model.
model_filename: The name of model file.
mpc_model_filename: The name of encrypted model file.
plain_model_filename: The name of decrypted model file.
"""
share_dirs = []
for sub_dir in os.listdir(mpc_model_dir):
......@@ -337,7 +437,7 @@ def decrypt_model(mpc_model_dir, plain_model_path, model_filename=None):
place = fluid.CPUPlace()
exe = fluid.Executor(place=place)
mpc_model_basename = os.path.basename(
model_filename) if model_filename is not None else ABY3_MODEL_NAME
mpc_model_filename) if mpc_model_filename is not None else ABY3_MODEL_NAME
[main_prog, _, _] = fluid.io.load_inference_model(
dirname=share_dirs[0], executor=exe, model_filename=mpc_model_basename)
......@@ -349,38 +449,65 @@ def decrypt_model(mpc_model_dir, plain_model_path, model_filename=None):
global_block = main_prog.global_block()
g_scope = fluid.global_scope()
for op in global_block.ops:
# rename ops
if str(op.type).startswith(MPC_OP_PREFIX):
new_type = str(op.type)[len(MPC_OP_PREFIX):]
op.desc.set_type(new_type)
for input_arg_name in op.input_arg_names:
var = global_block.var(input_arg_name)
if var.name != "feed" and var.name != "fetch":
if var.shape[0] != ABY3_SHARE_DIM:
# a set storing unique variables to decrypt
vars_set = set()
# store variable name in vars_set, and decrypt dtype and shape
for mpc_var_name in global_block.vars:
mpc_var = global_block.var(mpc_var_name)
if mpc_var.name != "feed" and mpc_var.name != "fetch":
vars_set.add(mpc_var.name)
if mpc_var_name in plain_vars:
# var.desc.set_dtype(fluid.framework.convert_np_dtype_to_dtype_(np.float64))
continue
elif mpc_var.shape[0] != ABY3_SHARE_DIM:
raise ValueError(
"Variable:{} shape: {} in saved model should start with 2."
.format(var.name, var.shape))
plain_var_shape = var.shape[1:]
old_var_shape = var.shape
var.desc.set_shape(plain_var_shape)
if g_scope.find_var(input_arg_name) is not None:
param = g_scope.find_var(input_arg_name)
.format(mpc_var.name, mpc_var.shape))
else:
plain_var_shape = mpc_var.shape[1:]
mpc_var.desc.set_shape(plain_var_shape)
mpc_var.desc.set_dtype(fluid.framework.convert_np_dtype_to_dtype_(np.float32))
# remove init op
first_mpc_op = global_block.ops[0]
if first_mpc_op.type == 'mpc_init':
global_block._remove_op(0)
# decrypt op type, or other attrs if needed
for mpc_op in global_block.ops:
# rename ops
if str(mpc_op.type).startswith(MPC_OP_PREFIX):
new_type = str(mpc_op.type)[len(MPC_OP_PREFIX):]
mpc_op.desc.set_type(new_type)
elif mpc_op.type == 'fill_constant':
mpc_op._set_attr(name='shape', val=(1L, ))
mpc_op._set_attr(name='value', val=1.0)
mpc_op._set_attr(name='dtype', val=5)
# decrypt tensor values for each variable in vars_set
for var_name in vars_set:
var = global_block.var(var_name)
if g_scope.find_var(var_name) is not None:
param = g_scope.find_var(var_name)
if var_name in plain_vars:
pass
else:
# reconstruct plaintext
param_tensor_shares = _get_param_all_shares(
input_arg_name, share_dirs, mpc_model_basename)
var_name, share_dirs, mpc_model_basename)
param_tensor = reconstruct(
param_tensor_shares, type=np.float32)
param.get_tensor()._set_dims(plain_var_shape)
param.get_tensor()._set_dims(var.shape)
param.get_tensor().set(param_tensor, place)
fluid.io.save_vars(
executor=exe,
dirname=plain_model_path,
vars=[var],
filename=input_arg_name)
filename=var_name)
# trigger sync to replace old ops
op_num = global_block.desc.op_size()
_ = global_block.desc.append_op()
......@@ -388,7 +515,7 @@ def decrypt_model(mpc_model_dir, plain_model_path, model_filename=None):
# save plaintext model file.
model_basename = os.path.basename(
model_filename) if model_filename is not None else MODEL_NAME
plain_model_filename) if plain_model_filename is not None else MODEL_NAME
if not os.path.exists(plain_model_path):
os.makedirs(plain_model_path)
model_name = os.path.join(plain_model_path, model_basename)
......@@ -404,7 +531,9 @@ def _get_param_all_shares(param_name, share_dirs, model_file):
param_name: The name of parameter.
share_dirs: The directories which storing model shares.
model_file: The name of model file.
:return:
Returns:
ndarray. The loaded shares.
"""
exe = fluid.Executor(place=fluid.CPUPlace())
param_shares = []
......@@ -416,3 +545,152 @@ def _get_param_all_shares(param_name, share_dirs, model_file):
param_tensor = np.array(param.get_tensor())
param_shares.append(param_tensor)
return np.array(param_shares, dtype=np.int64)
def _is_supported_op(op_name):
"""
Check if op is supported for encryption and decryption.
Args:
op_name: The name of op.
Returns:
True if supported.
"""
if op_name not in supported_mpc_ops:
if str(op_name).endswith('_grad'):
return _is_supported_op(str(op_name)[:-5])
else:
return False
return True
def load_mpc_model(exe, mpc_model_dir, mpc_model_filename, inference=False):
"""
Load MPC model from files. An init OP is inserted into the loaded program,
which is then switched to default_main_program for the further MPC procedure.
Args:
exe: The executor used for loading.
mpc_model_dir: The directory of MPC model.
mpc_model_filename: The filename of MPC model.
inference: Whether the model to load is used for inference. If true, the
model to load should be an inference model, and feed_name, fetch_targets
would be returned with the loaded program after inserting init OP. Otherwise,
after inserting init OP, the loaded program would be switched to
default_main_program and returned. Default value is False.
Returns:
default_main_program if inference is False. Otherwise, default_main_program,
feed_name, and fetch_targets would be returned.
"""
mpc_program, feed_names, fetch_targets = fluid.io.load_inference_model(executor=exe,
dirname=mpc_model_dir,
model_filename=mpc_model_filename)
# find init OP
global_block = fluid.default_main_program().global_block()
init_op_idx = _find_init_op_idx(global_block)
if init_op_idx < 0:
raise RuntimeError('No mpc_init op in global block, '
'maybe you should use paddle_fl.mpc.init() first.')
init_op = global_block.ops[init_op_idx]
# find the last feed OP for inserting init OP
last_feed_op_idx = _find_last_feed_op_idx(mpc_program.global_block())
# insert init OP as the first OP of MPC program if no feed OP,
# otherwise, insert it after the last feed OP.
insert_idx = 0 if last_feed_op_idx < 0 else last_feed_op_idx + 1
loaded_mpc_program = _insert_init_op(main_prog=mpc_program,
init_op=init_op,
index=insert_idx)
if inference:
return loaded_mpc_program, feed_names, fetch_targets
else:
# switch loaded_mpc_program to default_main_program
fluid.framework.switch_main_program(loaded_mpc_program)
return fluid.default_main_program()
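
# Illustrative usage sketch; the helper name, directory and file name below are placeholders.
def _load_mpc_model_usage_sketch():
    """A minimal sketch of load_mpc_model(). pfl_mpc.init() must have been
    called first so that an mpc_init OP already exists in
    default_main_program(); otherwise a RuntimeError is raised.
    """
    exe = fluid.Executor(fluid.CPUPlace())
    return load_mpc_model(exe=exe,
                          mpc_model_dir="/tmp/mpc_model/model_share_0",
                          mpc_model_filename="__model__.aby3",
                          inference=False)
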
def _find_init_op_idx(block):
"""
Find the index of mpc_init op.
Args:
block: The block of program.
Returns:
The index of mpc_init op.
"""
for idx, op in enumerate(block.ops):
if op.type == 'mpc_init':
return idx
return -1
def _find_last_feed_op_idx(block):
"""
Find the index of the last feed OP.
Args:
block: The block of program.
Returns:
The index of the last feed OP.
"""
feed_idx = -1
for idx, op in enumerate(block.ops):
if op.type == 'feed':
feed_idx = idx
return feed_idx
def save_trainable_model(exe, model_dir, model_filename=None, program=None):
"""
Save trainable model, which includes saving program and
persistable parameters into files. The saved model can be
loaded by fluid.io.load_inference_model for further training
or updating.
Args:
exe: The executor used for saving.
model_dir: The directory of model to save.
model_filename: The filename of model to save.
program: The program to save, default to default_main_program.
TODO: can move this to paddle_mpc/python/paddle_fl/mpc/io.py
"""
if not os.path.exists(model_dir):
os.makedirs(model_dir)
model_basename = os.path.basename(
model_filename) if model_filename is not None else ABY3_MODEL_NAME
# save program
model_name = os.path.join(model_dir, model_basename)
if program is None:
program = fluid.default_main_program()
with open(model_name, "wb") as f:
f.write(program.desc.serialize_to_string())
# save parameters
fluid.io.save_persistables(executor=exe,
dirname=model_dir,
main_program=program)
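
# Illustrative usage sketch; the helper name and directory below are placeholders.
def _save_trainable_model_usage_sketch():
    """A minimal sketch of save_trainable_model(): the saved program and
    persistable parameters can later be reloaded (e.g. with load_mpc_model
    above) for further training or updating.
    """
    exe = fluid.Executor(fluid.CPUPlace())
    save_trainable_model(exe=exe, model_dir="/tmp/mpc_model_to_update")
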
def _insert_init_op(main_prog, init_op, index):
"""
Insert init OP into main_prog according to the index.
Args:
main_prog: The program to insert init OP.
init_op: The init OP for MPC running.
index: The place that the init_op to insert.
Returns:
The program after inserting init OP.
"""
main_prog.global_block()._sync_with_cpp()
op_desc = main_prog.global_block().desc._insert_op(index)
mpc_init_op = fluid.framework.Operator(block=main_prog.global_block(),
desc=op_desc,
type=init_op.type,
attrs=init_op.all_attrs())
main_prog.global_block().ops.insert(index, mpc_init_op)
return main_prog
......@@ -15,10 +15,11 @@
This module provides data alignment tools, implemented with an OT (Oblivious Transfer)-based
PSI (Private Set Intersection) algorithm.
"""
from multiprocessing.connection import Client, Listener
import os
import sys
import mpc_data_utils as mdu
from multiprocessing.connection import Client, Listener
__all__ = ['align', ]
......
......@@ -20,6 +20,15 @@ Encrypted data files of feature and label would be generated and saved in `/tmp`
#### (2). Launch Demo with A Shell Script
You should set the environment variables as follows:
```
export PYTHON=/your/python
export PATH_TO_REDIS_BIN=/path/to/redis_bin
export LOCALHOST=/your/localhost
export REDIS_PORT=/your/redis/port
```
Launch demo with the `run_standalone.sh` script. The concrete command is:
```bash
......@@ -32,11 +41,14 @@ Besides, predictions would be made in this demo once training is finished. The p
#### (3). Decrypt Data
Decrypt the saved prediction data and save the decrypted prediction results into a specified file using `decrypt_data_to_file()` in `process_data.py` script. For example, users can write the following code into a python script named `decrypt_save.py`, and then run the script with command `python decrypt_save.py`. The decrypted prediction results would be saved into `mpc_label`.
Decrypt the saved prediction data and save the decrypted prediction results into a specified file using `decrypt_data_to_file()` in `process_data.py` script. For example, users can write the following code into a python script named `decrypt_save.py`, and then run the script with command `python decrypt_save.py decrypt_file`. The decrypted prediction results would be saved into `decrypt_file`.
```python
import sys
decrypt_file=sys.argv[1]
import process_data
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), "mpc_label")
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), decrypt_file)
```
**Note**: remember to delete the prediction files generated in the `/tmp` directory during the last run, so that they do not affect the decrypted results of the current run. To simplify this, we provide the following commands in `run_standalone.sh`, which delete the files mentioned above when the script runs.
......@@ -91,10 +103,13 @@ Similarly, predictions with cypher text format would be saved in `/tmp` director
#### (5). Decrypt Prediction Data
Each computation party sends `mnist_output_prediction.part` file in `/tmp` directory to the `/tmp` directory of data owner. Data owner decrypts the prediction data and saves the decrypted prediction results into a specified file using `decrypt_data_to_file()` in `process_data.py` script. For example, users can write the following code into a python script named `decrypt_save.py`, and then run the script with command `python decrypt_save.py`. The decrypted prediction results would be saved into file `mpc_label`.
Each computation party sends `mnist_output_prediction.part` file in `/tmp` directory to the `/tmp` directory of data owner. Data owner decrypts the prediction data and saves the decrypted prediction results into a specified file using `decrypt_data_to_file()` in `process_data.py` script. For example, users can write the following code into a python script named `decrypt_save.py`, and then run the script with command `python decrypt_save.py decrypt_file`. The decrypted prediction results would be saved into file `decrypt_file`.
```python
import sys
decrypt_file=sys.argv[1]
import process_data
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), "mpc_label")
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), decrypt_file)
```
......@@ -20,7 +20,16 @@ process_data.generate_encrypted_test_data()
#### 2. Launch Demo with A Shell Script
Launch and run the demo with the `run_standalone.sh` script. The command is:
Before running the demo, set the following environment variables:
```
export PYTHON=/your/python
export PATH_TO_REDIS_BIN=/path/to/redis_bin
export LOCALHOST=/your/localhost
export REDIS_PORT=/your/redis/port
```
Then launch and run the demo with the `run_standalone.sh` script. The command is:
```bash
bash run_standalone.sh mnist_demo.py
......@@ -32,11 +41,14 @@ bash run_standalone.sh mnist_demo.py
#### 3. Decrypt Data
Use `decrypt_data_to_file()` in the `process_data.py` script to decrypt the saved ciphertext prediction results and save the plaintext predictions into a specified file. For example, write the following content into a `decrypt_save.py` script, then run `python decrypt_save.py`; the plaintext prediction results will be saved in the `mpc_label` file.
Use `decrypt_data_to_file()` in the `process_data.py` script to decrypt the saved ciphertext prediction results and save the plaintext predictions into a specified file. For example, write the following content into a `decrypt_save.py` script, then run `python decrypt_save.py decrypt_file`; the plaintext prediction results will be saved in the `decrypt_file` file.
```python
import sys
decrypt_file=sys.argv[1]
import process_data
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), "mpc_label")
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), decrypt_file)
```
**Note**: before launching the demo again, delete the ciphertext prediction files saved in `/tmp` during the last run, so that they do not affect the reconstruction of this run's ciphertext data. To simplify this, we added the following content to the `run_standalone.sh` script, which deletes the old data when the script is executed.
......@@ -93,10 +105,13 @@ $PYTHON_EXECUTABLE mnist_demo.py $PARTY_ID $SERVER $PORT
#### 5. Decrypt Prediction Data
Each computation party sends the `mnist_output_prediction.part` file in its `/tmp` directory to the /tmp directory of the data owner. The data owner uses `decrypt_data_to_file()` in the `process_data.py` script to decrypt the ciphertext predictions and save the plaintext results into a specified file. For example, write the following content into a `decrypt_save.py` script, then run `python decrypt_save.py`; the plaintext prediction results will be saved in the `mpc_label` file.
Each computation party sends the `mnist_output_prediction.part` file in its `/tmp` directory to the /tmp directory of the data owner. The data owner uses `decrypt_data_to_file()` in the `process_data.py` script to decrypt the ciphertext predictions and save the plaintext results into a specified file. For example, write the following content into a `decrypt_save.py` script, then run `python decrypt_save.py decrypt_file`; the plaintext prediction results will be saved in the `decrypt_file` file.
```python
import sys
decrypt_file=sys.argv[1]
import process_data
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), "mpc_label")
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), decrypt_file)
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Decrypt Prediction Data.
"""
import sys
import process_data
decrypt_file=sys.argv[1]
BATCH_SIZE=128
process_data.decrypt_data_to_file("/tmp/mnist_output_prediction", (BATCH_SIZE,), decrypt_file)
......@@ -16,6 +16,7 @@ MNIST Demo
"""
import sys
import os
import numpy as np
import time
......@@ -78,18 +79,20 @@ test_loader.set_batch_generator(test_batch_sample, places=place)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
start_time = time.time()
step = 0
for epoch_id in range(epoch_num):
start_time = time.time()
step = 0
# feed data via loader
for sample in loader():
batch_start = time.time()
exe.run(feed=sample, fetch_list=[cost.name])
batch_end = time.time()
if step % 50 == 0:
print('Epoch={}, Step={}'.format(epoch_id, step))
print('Epoch={}, Step={}, batch_cost={:.4f} s'.format(epoch_id, step, (batch_end - batch_start)))
step += 1
end_time = time.time()
print('Mpc Training of Epoch={} Batch_size={}, cost time in seconds:{}'
end_time = time.time()
print('Mpc Training of Epoch={} Batch_size={}, epoch_cost={:.4f} s'
.format(epoch_num, BATCH_SIZE, (end_time - start_time)))
# prediction
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Prepare data for MNIST.
"""
import process_data
process_data.generate_encrypted_data()
process_data.generate_encrypted_test_data()
......@@ -17,6 +17,7 @@ Process data for MNIST.
import numpy as np
import paddle
import six
import os
from paddle_fl.mpc.data_utils import aby3
sample_reader = paddle.dataset.mnist.train()
......@@ -77,10 +78,12 @@ def load_decrypt_data(filepath, shape):
p = aby3.reconstruct(np.array(instance))
print(p)
def decrypt_data_to_file(filepath, shape, decrypted_filepath):
def decrypt_data_to_file(filepath, shape, decrypted_file):
"""
load the encrypted data and reconstruct to a file
"""
if os.path.exists(decrypted_file):
os.remove(decrypted_file)
part_readers = []
for id in six.moves.range(3):
part_readers.append(aby3.load_aby3_shares(filepath, id=id, shape=shape))
......@@ -88,6 +91,6 @@ def decrypt_data_to_file(filepath, shape, decrypted_filepath):
for instance in aby3_share_reader():
p = aby3.reconstruct(np.array(instance))
with open(decrypted_filepath, 'a+') as f:
with open(decrypted_file, 'a+') as f:
for i in p:
f.write(str(i) + '\n')
......@@ -31,12 +31,13 @@
# bash run_standalone.sh TEST_SCRIPT_NAME
#
# modify the following vars according to your environment
PYTHON="python"
REDIS_HOME="path_to_redis_bin"
SERVER="localhost"
PORT=9937
# please set the following environment vars according to your environment
PYTHON=${PYTHON}
REDIS_HOME=${PATH_TO_REDIS_BIN}
SERVER=${LOCALHOST}
PORT=${REDIS_PORT}
echo "redis home in ${REDIS_HOME}, server is ${SERVER}, port is ${PORT}"
function usage() {
echo 'run_standalone.sh SCRIPT_NAME [ARG...]'
exit 0
......@@ -63,10 +64,21 @@ $REDIS_BIN -h $SERVER -p $PORT flushall
# remove temp data generated in last time
PRED_FILE="/tmp/mnist_output_prediction.*"
if [ "$PRED_FILE" ]; then
ls ${PRED_FILE}
if [ $? -eq 0 ]; then
rm -rf $PRED_FILE
fi
TRAINING_FILE="/tmp/mnist2_feature.part*"
ls ${TRAINING_FILE}
if [ $? -ne 0 ]; then
echo "There is no data in /tmp, please prepare data with "python prepare.py" firstly"
exit 1
else
echo "There are data for mnist:"
echo "`ls ${TRAINING_FILE}`"
fi
# kick off script with roles of 1 and 2, and redirect output to /dev/null
for role in {1..2}; do
......
## Data Alignment Tool
This is an example of using the `alignment` function to build a command line tool of PSI (Private Set Intersection).
### Usage
```bash
python align.py --party_id=$PARTY_ID --endpoints=$END_POINTS --data_file=$FILE_NAME [--is_receiver]
```
### Example
Take data alignment between two parties, e.g., Alice (whose party_id is 0, IP address is 'A.A.A.A', port is 11111) and Bob (whose party_id is 1, IP address is 'B.B.B.B', port is 22222), as an example. Alice and Bob would like to find the intersection of alice_data.txt and bob_data.txt respectively, and Bob is intended to receive the final result.
On each party:
* **Alice**
```bash
python align.py --party_id=0 --endpoints=0:A.A.A.A:11111,1:B.B.B.B:22222 --data_file=alice_data.txt
```
* **Bob**
```bash
python align.py --party_id=1 --endpoints=0:A.A.A.A:11111,1:B.B.B.B:22222 --data_file=bob_data.txt --is_receiver
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Data alignment.
"""
import argparse
import paddle_fl.mpc.data_utils.alignment as alignment
def parse_args():
"""
Parse arguments.
"""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--party_id", type=int, help="the id of this party")
parser.add_argument("--endpoints", type=str,
default='0:127.0.0.1:11111,1:127.0.0.1:22222',
help="id:ip:port info")
parser.add_argument("--data_file", type=str, help="data file")
parser.add_argument("--is_receiver", action='store_true', help="whether is receiver")
args = parser.parse_args()
return args
def do_align(args):
"""
Do alignment.
"""
# read data from file
input_set = set()
for line in open(args.data_file, 'r'):
input_set.add(line.strip())
# do alignment
result = alignment.align(input_set=input_set,
party_id=args.party_id,
endpoints=args.endpoints,
is_receiver=args.is_receiver)
return result
if __name__ == '__main__':
# use signal for interrupt from keyboard
import signal
signal.signal(signal.SIGINT, signal.SIG_DFL)
args = parse_args()
print('ARGUMENTS: party_id={}, endpoints={}, is_receiver={}, data_file={}'
.format(args.party_id, args.endpoints, args.is_receiver, args.data_file))
align_rst = do_align(args)
print("Alignment result is: {}".format(align_rst))
......@@ -19,18 +19,44 @@ Encrypted data files of feature and label would be generated and saved in `/tmp`
#### (2). Launch Demo with A Shell Script
You should set the environment variables as follows:
```
export PYTHON=/your/python
export PATH_TO_REDIS_BIN=/path/to/redis_bin
export LOCALHOST=/your/localhost
export REDIS_PORT=/your/redis/port
```
Launch demo with the `run_standalone.sh` script. The concrete command is:
```bash
bash run_standalone.sh uci_housing_demo.py
bash run_standalone.sh uci_demo.py
```
The loss with cypher text format will be displayed on screen while training. At the same time, the loss data is also saved in the `/tmp` directory, with a file name format similar to what is described in Step 1.
Besides, predictions would be made in this demo once training is finished. The predictions with cypher text format would also be saved in the `/tmp` directory.
#### (3). Decrypt Data
Finally, using `load_decrypt_data()` in `process_data.py` script, this demo would decrypt and print the loss and predictions, which can be compared with related results of Paddle plain text model.
For example, users can write the following code into a python script named `decrypt_save.py`, and then run the script with command `python decrypt_save.py decrypt_loss_file decrypt_prediction_file`. The decrypted loss and prediction results would be saved into two files correspondingly.
```python
import sys
import process_data
decrypt_loss_file=sys.argv[1]
decrypt_prediction_file=sys.argv[2]
BATCH_SIZE=10
process_data.load_decrypt_data("/tmp/uci_loss", (1, ), decrypt_loss_file)
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE, ), decrypt_prediction_file)
```
**Note**: remember to delete the loss and prediction files generated in the `/tmp` directory during the last run, so that they do not affect the decrypted results of the current run. To simplify this, we provide the following commands in `run_standalone.sh`, which delete the files mentioned above when the script runs.
```bash
......@@ -58,9 +84,9 @@ Data owner encrypts data. Concrete operations are consistent with “Prepare Dat
According to the suffix of file name, distribute encrypted data files to `/tmp ` directories of all 3 computation parties. For example, send `house_feature.part0` and `house_label.part0` to `/tmp` of party 0 with `scp` command.
#### (3). Modify uci_housing_demo.py
#### (3). Modify uci_demo.py
Each computation party makes the following modifications on `uci_housing_demo.py` according to the environment of machine.
Each computation party makes the following modifications to `uci_demo.py` according to its machine environment.
* Modify IP Information
......@@ -70,18 +96,6 @@ Each computation party makes the following modifications on `uci_housing_demo.py
pfl_mpc.init("aby3", int(role), "localhost", server, int(port))
```
* Comment Out Codes for Single Machine Running
Comment out the following codes which are used when running on single machine.
```python
import process_data
print("uci_loss:")
process_data.load_decrypt_data("/tmp/uci_loss", (1,))
print("prediction:")
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE,))
```
#### (4). Launch Demo on Each Party
**Note** that a Redis service is required for running the demo. Remember to clear the Redis server's cache before launching the demo on each computation party, to avoid any negative influence from cached records. The following command can be used to clear Redis, where REDIS_BIN is the redis-cli executable, and SERVER and PORT are the IP and port of the Redis server respectively.
......@@ -93,7 +107,7 @@ $REDIS_BIN -h $SERVER -p $PORT flushall
Launch demo on each computation party with the following command,
```
$PYTHON_EXECUTABLE uci_housing_demo.py $PARTY_ID $SERVER $PORT
$PYTHON_EXECUTABLE uci_demo.py $PARTY_ID $SERVER $PORT
```
where PYTHON_EXECUTABLE is the python in which PaddleFL is installed, PARTY_ID is the ID of the computation party (0, 1, or 2), and SERVER and PORT are the IP and port of the Redis server respectively.
......@@ -106,20 +120,19 @@ Similarly, training loss with cypher text format would be printed on the screen
Each computation party sends the `uci_loss.part` and `uci_prediction.part` files in its `/tmp` directory to the `/tmp` directory of the data owner. The data owner decrypts and gets the plaintext loss and predictions with `load_decrypt_data()` in `process_data.py`.
For example, the following code can be written into a python script to decrypt and print training loss.
For example, the following code can be written into a python script to decrypt and print training loss and predictions.
```python
import sys
import process_data
print("uci_loss:")
process_data.load_decrypt_data("/tmp/uci_loss", (1,))
```
And the following code can be written into a python script to decrypt and print predictions.
```python
import process_data
print("prediction:")
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE,))
decrypt_loss_file=sys.argv[1]
decrypt_prediction_file=sys.argv[2]
BATCH_SIZE=10
process_data.load_decrypt_data("/tmp/uci_loss", (1, ), decrypt_loss_file)
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE, ), decrypt_prediction_file)
```
### 3. Convergence of paddle_fl.mpc vs paddle
......
......@@ -19,17 +19,42 @@ process_data.generate_encrypted_data()
#### 2. Launch Demo with A Shell Script
Launch and run the demo with the `run_standalone.sh` script. The command is:
Before running the demo, set the following environment variables:
```
export PYTHON=/your/python
export PATH_TO_REDIS_BIN=/path/to/redis_bin
export LOCALHOST=/your/localhost
export REDIS_PORT=/your/redis/port
```
Then launch and run the demo with the `run_standalone.sh` script. The command is:
```bash
bash run_standalone.sh uci_housing_demo.py
bash run_standalone.sh uci_demo.py
```
After running, the ciphertext loss data from training will be printed on screen; at the same time, the corresponding ciphertext loss data will be saved to files in the /tmp directory, with a file name format similar to what is described in Step 1.
In addition, once training is finished, the demo continues with prediction and also saves the ciphertext prediction results to files in the /tmp directory.
#### 3. Decrypt Data
Finally, the demo uses `load_decrypt_data()` in the `process_data.py` script to reconstruct and print the plaintext loss data and prediction results, which can be compared with the results of the plaintext Paddle model.
For example, write the following content into a decrypt_save.py script, then run python decrypt_save.py decrypt_loss_file decrypt_prediction_file; the plaintext loss data and prediction results will be saved into the two files respectively.
```python
import sys
import process_data
decrypt_loss_file=sys.argv[1]
decrypt_prediction_file=sys.argv[2]
BATCH_SIZE=10
process_data.load_decrypt_data("/tmp/uci_loss", (1, ), decrypt_loss_file)
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE, ), decrypt_prediction_file)
```
**Note**: before launching the demo again, delete the loss and prediction files saved in `/tmp` during the last run, so that they do not affect the reconstruction of this run's ciphertext data. To simplify this, we added the following content to the `run_standalone.sh` script, which deletes the old data when the script is executed.
......@@ -60,9 +85,9 @@ fi
`house_feature.part0` and `house_label.part0` are sent to the /tmp directory of party 0.
#### 3. Each computation party modifies the uci_housing_demo.py script
#### 3. Each computation party modifies the uci_demo.py script
Each computation party makes the following changes to uci_housing_demo.py according to its own machine environment:
Each computation party makes the following changes to uci_demo.py according to its own machine environment:
* Modify IP information
......@@ -72,17 +97,6 @@ fi
pfl_mpc.init("aby3", int(role), "localhost", server, int(port))
```
* Comment out the code needed only for single-machine running
Comment out the following code in the script; it is only used in the single-machine case.
```python
import process_data
print("uci_loss:")
process_data.load_decrypt_data("/tmp/uci_loss", (1,))
print("prediction:")
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE,))
```
#### 4. Launch the demo on each computation party
......@@ -95,7 +109,7 @@ $REDIS_BIN -h $SERVER -p $PORT flushall
Run the following command on each computation party to launch the demo:
```
$PYTHON_EXECUTABLE uci_housing_demo.py $PARTY_ID $SERVER $PORT
$PYTHON_EXECUTABLE uci_demo.py $PARTY_ID $SERVER $PORT
```
Here, PYTHON_EXECUTABLE is the python in which PaddleFL is installed, PARTY_ID is the ID of the computation party (0, 1, or 2), and SERVER and PORT are the IP address and port of the redis server respectively.
......@@ -108,20 +122,19 @@ $PYTHON_EXECUTABLE uci_housing_demo.py $PARTY_ID $SERVER $PORT
Each computation party sends the `uci_loss.part` and `uci_prediction.part` files in its `/tmp` directory to the /tmp directory of the data owner. The data owner uses load_decrypt_data() in the process_data.py script to decrypt and reconstruct the loss and prediction data.
For example, use a python script with the following content to print the decrypted loss data:
For example, write the following content into a decrypt_save.py script, then run python decrypt_save.py decrypt_loss_file decrypt_prediction_file; the plaintext loss data and prediction results will be saved into the two files respectively.
```python
import sys
import process_data
print("uci_loss:")
process_data.load_decrypt_data("/tmp/uci_loss", (1,))
```
Use a python script with the following content to print the decrypted prediction data:
```python
import process_data
print("prediction:")
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE,))
decrypt_loss_file=sys.argv[1]
decrypt_prediction_file=sys.argv[2]
BATCH_SIZE=10
process_data.load_decrypt_data("/tmp/uci_loss", (1, ), decrypt_loss_file)
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE, ), decrypt_prediction_file)
```
### 3. Single-Machine Accuracy Test
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Decrypt Prediction Data.
"""
import sys
import process_data
decrypt_loss_file=sys.argv[1]
decrypt_prediction_file=sys.argv[2]
BATCH_SIZE=10
process_data.load_decrypt_data("/tmp/uci_loss", (1, ), decrypt_loss_file)
print("uci_loss done")
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE, ), decrypt_prediction_file)
print("prediction done")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Prepare data for UCI Housing.
"""
import process_data
process_data.generate_encrypted_data()
......@@ -17,6 +17,7 @@ Process data for UCI Housing.
import numpy as np
import paddle
import six
import os
from paddle_fl.mpc.data_utils import aby3
sample_reader = paddle.dataset.uci_housing.train()
......@@ -45,10 +46,12 @@ def generate_encrypted_data():
aby3.save_aby3_shares(encrypted_housing_labels, "/tmp/house_label")
def load_decrypt_data(filepath, shape):
def load_decrypt_data(filepath, shape, decrypted_file):
"""
load the encrypted data and reconstruct
"""
if os.path.exists(decrypted_file):
os.remove(decrypted_file)
part_readers = []
for id in six.moves.range(3):
part_readers.append(
......@@ -59,4 +62,6 @@ def load_decrypt_data(filepath, shape):
for instance in aby3_share_reader():
p = aby3.reconstruct(np.array(instance))
print(p)
with open(decrypted_file, 'a+') as f:
for i in p:
f.write(str(i) + '\n')
......@@ -32,12 +32,13 @@
#
# modify the following vars according to your environment
PYTHON="python"
REDIS_HOME="path_to_redis_bin"
SERVER="localhost"
PORT=9937
PYTHON=${PYTHON}
REDIS_HOME=${PATH_TO_REDIS_BIN}
SERVER=${LOCALHOST}
PORT=${REDIS_PORT}
echo "redis home in ${REDIS_HOME}, server is ${SERVER}, port is ${PORT}"
function usage() {
function usage(){
echo 'run_standalone.sh SCRIPT_NAME [ARG...]'
exit 0
}
......@@ -64,14 +65,25 @@ $REDIS_BIN -h $SERVER -p $PORT flushall
# remove temp data generated in last time
LOSS_FILE="/tmp/uci_loss.*"
PRED_FILE="/tmp/uci_prediction.*"
if [ "$LOSS_FILE" ]; then
ls ${LOSS_FILE}
if [ $? -eq 0 ]; then
rm -rf $LOSS_FILE
fi
if [ "$PRED_FILE" ]; then
ls ${PRED_FILE}
if [ $? -eq 0 ]; then
rm -rf $PRED_FILE
fi
TRAINING_FILE="/tmp/house_feature.part*"
ls ${TRAINING_FILE}
if [ $? -ne 0 ]; then
echo "There is no data in /tmp, please prepare data with "python prepare.py" firstly"
exit 1
else
echo "There are data for uci:"
echo "`ls ${TRAINING_FILE}`"
fi
# kick off script with roles of 1 and 2, and redirect output to /dev/null
for role in {1..2}; do
......
......@@ -61,8 +61,8 @@ exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
epoch_num = 20
start_time = time.time()
for epoch_id in range(epoch_num):
start_time = time.time()
step = 0
# Method 1: feed data directly
......@@ -71,17 +71,18 @@ for epoch_id in range(epoch_num):
# Method 2: feed data via loader
for sample in loader():
step_start = time.time()
mpc_loss = exe.run(feed=sample, fetch_list=[avg_loss])
step_end = time.time()
if step % 50 == 0:
print('Epoch={}, Step={}, Loss={}'.format(epoch_id, step,
mpc_loss))
print('Epoch={}, Step={}, batch_cost={:.4f} s, Loss={},'.format(epoch_id, step,
(step_end - step_start), mpc_loss))
with open(loss_file, 'ab') as f:
f.write(np.array(mpc_loss).tostring())
step += 1
end_time = time.time()
print('Mpc Training of Epoch={} Batch_size={}, cost time in seconds:{}'
end_time = time.time()
print('Mpc Training of Epoch={} Batch_size={}, epoch_cost={:.4f} s'
.format(epoch_num, BATCH_SIZE, (end_time - start_time)))
prediction_file = "/tmp/uci_prediction.part{}".format(role)
......@@ -92,9 +93,3 @@ for sample in loader():
with open(prediction_file, 'ab') as f:
f.write(np.array(prediction).tostring())
break
import process_data
print("uci_loss:")
process_data.load_decrypt_data("/tmp/uci_loss", (1, ))
print("prediction:")
process_data.load_decrypt_data("/tmp/uci_prediction", (BATCH_SIZE, ))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MPC('int64') Initializer
"""
import numpy as np
import mpc_data_utils as mdu
from paddle.fluid.initializer import Initializer
import paddle.fluid.framework as framework
from paddle.fluid.core import VarDesc
from paddle.fluid import unique_name
from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
class NumpyArrayInitializer(Initializer):
"""Init a mpc parameter with an numpy array (astype('int64'))
This op initialize the variable by numpy array.
Args:
value (numpy): numpy array to initialize the variable
Returns:
A Tensor variable initialized by numpy.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle_fl.mpc as pfl_mpc
import numpy
weight_share = numpy.array([1,2]).astype('int64')
w_param_attrs = fluid.ParamAttr(name='emb_weight',
learning_rate=0.5,
initializer=pfl_mpc.initializer.NumpyArrayInitializer(weight_share),
trainable=True)
"""
def __init__(self, value):
import numpy
assert isinstance(value, numpy.ndarray)
super(NumpyArrayInitializer, self).__init__()
self._value = value
def __call__(self, var, block):
"""Add constant initialization ops for a variable
Args:
var: Variable that needs to be initialized
block: The block in which initialization ops
should be added
Returns:
the initialization op
"""
assert isinstance(var, framework.Variable)
assert isinstance(block, framework.Block)
out_var = var
out_dtype = var.dtype
np_value = self._value
value_name = "int64_values"
if (out_dtype != VarDesc.VarType.INT64):
raise ValueError("Only 'int64' dtype is supported in paddlefl's initializer, "
"Use paddle.fluid.initializer for other dtype.")
values = [int(v) for v in np_value.flat]
if self._value.size > 1024 * 1024 * 1024:
raise ValueError("The size of input is too big. Please consider "
"saving it to file and 'load_op' to load it")
op = block._prepend_op(
type='assign_value',
outputs={'Out': out_var},
attrs={
'dtype': out_dtype,
'shape': list(self._value.shape),
value_name: values
},
stop_gradient=True)
if not framework.in_dygraph_mode():
var.op = op
return op
class XavierInitializer(Initializer):
"""
This class implements the Xavier weight initializer from the paper
`Understanding the difficulty of training deep feedforward neural
networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
by Xavier Glorot and Yoshua Bengio.
This initializer is designed to keep the scale of the gradients
approximately same in all the layers. In case of Uniform distribution,
the range is [-x, x], where
.. math::
x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}}
In case of Normal distribution, the mean is 0 and the standard deviation
is
.. math::
\sqrt{\\frac{2.0}{fan\_in + fan\_out}}
Args:
uniform (bool,default True): whether to use uniform ,if False use normal distribution
fan_in (float,default None): fan_in for Xavier initialization. If None, it is
inferred from the variable.
fan_out (float,default None): fan_out for Xavier initialization. If None, it is
inferred from the variable.
seed (int): random seed
Note:
It is recommended to set fan_in and fan_out to None for most cases.
A share of the distribution will be returned.
The seeds of the three parties should be the same.
Examples:
.. code-block:: python
import paddle_fl.mpc as pfl_mpc
queries = pfl_mpc.data(name='x', shape=[2,1], dtype='int64')
fc = pfl_mpc.layers.fc(
input=queries, size=10,
param_attr=pfl_mpc.initializer.Xavier(uniform=False))
"""
def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0):
assert uniform is not None
assert seed is not None
super(XavierInitializer, self).__init__()
self._uniform = uniform
self._fan_in = fan_in
self._fan_out = fan_out
self._seed = seed
def _compute_fans(self, var):
"""Compute the fan_in and the fan_out for layers
This method computes the fan_in and the fan_out
for neural network layers, if not specified. It is
not possible to perfectly estimate fan_in and fan_out.
This method will estimate it correctly for matrix multiply and
convolutions.
Args:
var: variable for which fan_in and fan_out have to be computed
Returns:
tuple of two integers (fan_in, fan_out)
"""
shape = var.shape
if not shape or len(shape) == 0:
raise ValueError("Shape should be larger than 0 in paddlefl's initializer.")
elif len(shape) == 1:
fan_in = fan_out = 1
elif len(shape) == 2:
fan_in = fan_out = shape[1]
elif len(shape) == 3:
# This is the case for simple matrix multiply
fan_in = shape[1]
fan_out = shape[2]
else:
# Assume this to be a convolutional kernel
# In PaddlePaddle, the shape of the kernel is like:
# [num_filters, num_filter_channels, ...] where the remaining
# dimensions are the filter_size
receptive_field_size = np.prod(shape[3:])
fan_in = shape[2] * receptive_field_size
fan_out = shape[1] * receptive_field_size
return (fan_in, fan_out)
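    # Illustrative note (not in the original source): since the leading dimension of an
    # mpc parameter is the share dimension, _compute_fans maps shapes as, for example:
    #   (2, 128)         -> fan_in = fan_out = 128            (matrix-multiply weight)
    #   (2, 64, 128)     -> fan_in = 64,  fan_out = 128
    #   (2, 32, 3, 3, 3) -> fan_in = 3 * 9 = 27, fan_out = 32 * 9 = 288  (conv kernel)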
def __call__(self, var, block):
"""Add xavier initialization ops for a variable
Args:
var: Variable that needs to be initialized
block: The block in which initialization ops
should be added
Returns:
the initialization op
"""
assert isinstance(block, framework.Block)
check_variable_and_dtype(var, "Out", ["int64"], "xavier_init")
if (var.dtype != VarDesc.VarType.INT64):
raise ValueError("Only 'int64' dtype is supported in paddlefl's initializer.")
f_in, f_out = self._compute_fans(var)
# If fan_in and fan_out are passed, use them
fan_in = f_in if self._fan_in is None else self._fan_in
fan_out = f_out if self._fan_out is None else self._fan_out
if self._seed == 0:
self._seed = block.program.random_seed
# create tmp var:
# out_var for random number, shape = (1, ...)
# out_expand_var for encrypted random number, shape = (2, ...), is same with var's shape
out_dtype = VarDesc.VarType.FP32
shape_ = list(var.shape)
shape_[0]=1
out_var = block.create_var(
name=unique_name.generate(".".join(
['gaussian_random', var.name, 'tmp'])),
shape=shape_,
dtype=out_dtype,
type=VarDesc.VarType.LOD_TENSOR,
persistable=False)
out_expand_var = block.create_var(
name=unique_name.generate(".".join(
['gaussian_random_expand', var.name, 'tmp'])),
shape=out_var.shape,
dtype=out_dtype,
type=VarDesc.VarType.LOD_TENSOR,
persistable=False)
if self._uniform:
limit = np.sqrt(6.0 / float(fan_in + fan_out))
op = block._prepend_op(
type="uniform_random",
inputs={},
outputs={"Out": out_var},
attrs={
"shape": out_var.shape,
"dtype": out_dtype,
"min": -limit,
"max": limit,
"seed": self._seed
},
stop_gradient=True)
else:
std = np.sqrt(2.0 / float(fan_in + fan_out))
op = block._prepend_op(
type="gaussian_random",
outputs={"Out": out_var},
attrs={
"shape": out_var.shape,
"dtype": out_dtype,
"mean": 0.0,
"std": std,
"seed": self._seed
},
stop_gradient=True)
# convert plaintext into cyphertext
block.append_op(
type="scale",
inputs={"X": out_var},
outputs={"Out": out_var},
attrs={"scale": float(mdu.mpc_one_share)})
# extend one share to two share
block.append_op(
type="concat",
inputs={"X": [out_var, out_var]},
outputs={"Out": [out_expand_var]},
attrs={"axis": 0})
# cast float into int64
block.append_op(
type="cast",
inputs={"X": out_expand_var},
outputs={"Out": var},
attrs={"in_dtype": out_expand_var.dtype,
"out_dtype": var.dtype})
if not framework.in_dygraph_mode():
var.op = op
return op
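# ---------------------------------------------------------------------------
# Minimal illustrative sketch (not part of the original module): it mirrors the
# uniform branch of XavierInitializer above in plain numpy. The `one_share`
# constant is a made-up stand-in for mdu.mpc_one_share, and the function name is
# hypothetical; this is a sketch of the scale/concat/cast pipeline, not the
# library's implementation.
def _xavier_uniform_share_sketch(shape, fan_in, fan_out, one_share=2 ** 16, seed=0):
    """Sample one plaintext Xavier tensor, scale it into a share, and duplicate
    it along a new leading axis of size 2, as the initializer's concat op does."""
    rng = np.random.RandomState(seed)
    limit = np.sqrt(6.0 / float(fan_in + fan_out))
    plain = rng.uniform(-limit, limit, size=shape).astype('float32')
    share = plain * float(one_share)                                   # "scale" op
    expanded = np.concatenate([share[np.newaxis], share[np.newaxis]], axis=0)  # "concat" op
    return expanded.astype('int64')                                    # "cast" op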
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides embedding operation for paddle_mpc.
"""
from __future__ import print_function
import six
import numpy as np
from paddle import fluid
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
import warnings
from .framework import MpcVariable
from .mpc_layer_helper import MpcLayerHelper
from .data_utils import aby3
__all__ = ['embedding']
def embedding(input,
size,
is_sparse=False,
is_distributed=False,
padding_idx=None,
param_attr=None,
dtype='int64'):
"""
The operator is used to lookup embeddings vector of ids provided by :attr:`input` .
It automatically constructs a 2D embedding matrix based on the
input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` .
    The `input` is the mpc one-hot tensor of indexes; its last dimension is equal to `emb_size`,
    and its shape must have 3 dimensions, i.e., (2, x, emb_size).
The shape of output Tensor is generated by replacing an emb_size dimension to the
last dimension of the input Tensor shape.
    **Note:** The id in :attr:`input` must satisfy :math:`0 <= id < size[0]` ,
    otherwise the program will throw an exception and exit.
    **Note:** Params of `is_sparse`, `is_distributed`, `padding_idx` have not been implemented.
.. code-block:: text
Case 1:
input is a Tensor.
input.data = aby3.make_share([[1, 0, 0], [0, 1, 0]])
input.shape = [2, 2, 3]
w.data = aby3.make_share([[1, 2], [2, 3], [3, 4]])
Given size = [2, 3, 2]
output is a Tensor:
out.shape = [2, 2, 2]
out.data.reconstruct = [[1, 2], [2, 3]]
Args:
input(MpcVariable): A Tensor or LoDTensor with type int64, which contains the id information.
The value of the input id should satisfy :math:`0<= id < size[0]` .
size(tuple|list): The shape of lookup table parameter. It should have two elements which
indicates the size of the dictionary of embeddings and the size of each embedding vector respectively.
is_sparse(bool, not implemented): The flag indicating whether to use sparse update. This parameter only
affects the performance of the backwards gradient update. It is recommended to set
            True because sparse update is faster. But some optimizers do not support sparse update,
such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` ,
:ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` ,
:ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` .
            In these cases, is_sparse must be False. Default: False.
is_distributed(bool, not implemented): Whether to store the embedding matrix in a distributed manner. Only used
in multi-machine distributed CPU training. Default: False.
padding_idx(int|long|None, not implemented): padding_idx needs to be in the interval [-vocab_size, vocab_size).
If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted
to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup
encounters :math:`padding\_idx` in id. And the padding data will not be updated while training.
If set None, it makes no effect to output. Default: None.
param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition,
user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter.
The local word vector needs to be transformed into numpy format, and the shape of local word
            vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer`
is used to load custom or pre-trained word vectors.
dtype(str|core.VarDesc.VarType.INT64): It refers to the data type of output Tensor.
It must be int64.
Returns:
Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` .
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle_fl.mpc as pfl
import numpy as np
# data should be mpc one hot tensor
data = pfl.data(name='x', shape=[4, 3], dtype='int64')
            # example 1
emb_1 = fluid.embedding(input=data, size=[3, 4])
# example 2: load custom or pre-trained word vectors
weight_data = np.random.random(size=(2, 3, 4)) # mpc word vectors with numpy format
w_param_attrs = fluid.ParamAttr(
name="emb_weight",
learning_rate=0.5,
initializer=fluid.initializer.NumpyArrayInitializer(weight_data),
trainable=True)
emb_2 = fluid.embedding(input=data, size=(3, 4), param_attr=w_param_attrs, dtype='int64')
"""
    if is_sparse:
        warnings.warn("the process on sparse data is the same as on dense data; "
                      "that is, 'is_sparse' is always set to 'False' in paddle_encrypted.")
    if is_distributed:
        warnings.warn("distributed deployment of paddle_encrypted has not been implemented; "
                      "that is, 'is_distributed' is always set to 'False' in paddle_encrypted.")
    if padding_idx:
        warnings.warn("padding_idx is not supported in paddle_encrypted; "
                      "that is, 'padding_idx' is always set to 'None' in paddle_encrypted.")
helper = MpcLayerHelper('embedding', **locals())
check_variable_and_dtype(input, 'input', ['int64'], 'paddle_encrypted.embedding')
check_dtype(dtype, 'dtype', ['int64'],
'paddle_encrypted.embedding')
w = helper.create_mpc_parameter(
attr=helper.param_attr, shape=size, dtype='int64', is_bias=False)
tmp = helper.create_mpc_variable_for_type_inference(dtype)
helper.append_op(
type='mpc_lookup_table_v2',
inputs={'Ids': input,
'W': w},
outputs={'Out': tmp},
attrs={
'is_sparse': False,
'is_distributed': False,
'remote_prefetch': False,
'padding_idx': None
})
return tmp
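# ---------------------------------------------------------------------------
# Minimal plaintext sketch (not part of the original module): with one-hot ids,
# a lookup table is just a matmul, which is, conceptually, what the
# mpc_lookup_table_v2 op computes on secret shares (an assumption drawn from the
# one-hot input described in the docstring above). Shapes here drop the leading
# share dimension; the function name is hypothetical.
def _one_hot_lookup_sketch():
    one_hot_ids = np.array([[0, 1, 0],
                            [1, 0, 0]], dtype='float64')      # 2 ids, vocab = 3
    table = np.array([[1., 2.],
                      [2., 3.],
                      [3., 4.]])                              # vocab = 3, emb = 2
    return one_hot_ids.dot(table)                             # [[2., 3.], [1., 2.]]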
......@@ -30,6 +30,10 @@ from . import ml
from .ml import *
from . import compare
from .compare import *
from . import conv
from .conv import conv2d
from . import rnn
from .rnn import *
__all__ = []
__all__ += basic.__all__
......@@ -37,3 +41,4 @@ __all__ += math.__all__
__all__ += matrix.__all__
__all__ += ml.__all__
__all__ += compare.__all__
__all__ += conv.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
All layers just related to the neural network.
"""
from __future__ import print_function
import os
import inspect
import warnings
import itertools
import numpy as np
import six
import paddle
from ..mpc_layer_helper import MpcLayerHelper
from ..framework import MpcVariable, check_mpc_variable_and_dtype
from functools import reduce
__all__ = [
'conv2d',
]
def _convert_to_list(value, n, name, dtype):
"""
Converts a single numerical type or iterable of numerical
types into an numerical type list.
Arguments:
        value: The value to validate and convert. Could be an int, or any iterable
of ints.
n: The size of the list to be returned.
name: The name of the argument being validated, e.g. "stride" or
"filter_size". This is only used to format error messages.
dtype: the numerical type of the element of the list to be returned.
Returns:
A list of n dtypes.
Raises:
        ValueError: If something other than an int/long or an iterable thereof was
passed.
"""
if isinstance(value, dtype):
return [value, ] * n
else:
try:
value_list = list(value)
except TypeError:
raise ValueError("The " + name +
"'s type must be list or tuple. Received: " + str(
value))
if len(value_list) != n:
raise ValueError("The " + name + "'s length must be " + str(n) +
". Received: " + str(value))
for single_value in value_list:
try:
dtype(single_value)
except (ValueError, TypeError):
raise ValueError(
"The " + name + "'s type must be a list or tuple of " + str(
n) + " " + str(dtype) + " . Received: " + str(
value) + " "
"including element " + str(single_value) + " of type" + " "
+ str(type(single_value)))
return value_list
def _is_symmetric_padding(padding, data_dim):
"""
Check whether padding is symmetrical.
"""
assert len(padding) == data_dim * 2 or len(padding) == data_dim
is_sys = True
if len(padding) == data_dim * 2:
for i in range(data_dim):
if padding[i * 2] != padding[i * 2 + 1]:
is_sys = False
return is_sys
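# Illustrative examples (not in the original source) for the two helpers above:
#   _convert_to_list(3, 2, 'stride', int)            -> [3, 3]
#   _convert_to_list((1, 2), 2, 'stride', int)       -> [1, 2]
#   _is_symmetric_padding([1, 1, 2, 2], data_dim=2)  -> True   ([top, bottom, left, right])
#   _is_symmetric_padding([1, 0, 2, 2], data_dim=2)  -> False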
def conv2d(input,
num_filters,
filter_size,
stride=1,
padding=0,
dilation=1,
groups=None,
param_attr=None,
bias_attr=None,
act=None,
name=None,
data_format="NCHW"):
"""
The convolution2D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input and
Output are in NCHW or NHWC format, where N is batch size, C is the number of
channels, H is the height of the feature, and W is the width of the feature.
Filter is in MCHW format, where M is the number of output image channels,
C is the number of input image channels, H is the height of the filter,
and W is the width of the filter. If the groups is greater than 1,
C will equal the number of input image channels divided by the groups.
Please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
for more details.
If bias attribution and activation type are provided, bias is added to the
output of the convolution, and the corresponding activation function is
applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
Where:
* :math:`X`: Input value, a tensor with NCHW or NHWC format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
- Input:
Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
.. math::
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
Args:
        input (Variable): The input is an mpc Tensor with the share dimension first,
            e.g. [2, N, C, H, W] for NCHW, and its data type must be int64.
        num_filters(int): The number of filters. It is the same as the number of
            output image channels.
filter_size (int|tuple): The filter size. If filter_size
is a tuple, it must contain two integers, (filter_size_height,
filter_size_width). Otherwise, filter_size_height = filter_size_width =\
filter_size.
stride (int|tuple): The stride size. It means the stride in convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
on both sides for each dimension.If `padding` is a string, either 'VALID' or
'SAME' which is the padding algorithm. If padding size is a tuple or list,
it could be in three forms: `[pad_height, pad_width]` or
`[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when
`data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0],
[pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
when `data_format` is `"NHWC"`, `pool_padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
dilation (int|tuple): The dilation size. It means the spacing between the kernel
points. If dilation is a tuple, it must contain two integers, (dilation_height,
dilation_width). Otherwise, dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups (int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
act (str): Activation type, if it is set to None, activation is not appended.
Default: None
name(str|None): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
Returns:
A Variable holding Tensor representing the conv2d, whose data type is the
same with input. If act is None, the tensor variable storing the convolution
result, and if act is not None, the tensor variable storing convolution
and non-linearity activation result.
Raises:
ValueError: If using "depthwise_conv2d" (which is not supported yet).
ValueError: If `data_format` is not "NCHW" or "NHWC".
        ValueError: If the channel dimension of the input is less than or equal to zero.
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ShapeError: If the input is not 4-D Tensor.
ShapeError: If the input's dimension size and filter's dimension size not equal.
ShapeError: If the dimension size of input minus the size of `stride` is not 2.
ShapeError: If the number of input channels is not equal to filter's channels * groups.
        ShapeError: If the number of output channels is not divisible by groups.
Examples:
.. code-block:: python
import paddle.fluid as fluid
data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
"""
check_mpc_variable_and_dtype(input, 'input', ['int64'],
'conv2d')
num_channels = input.shape[1 + 1]
use_cudnn = False
if not isinstance(use_cudnn, bool):
raise ValueError("Attr(use_cudnn) should be True or False. Received "
"Attr(use_cudnn): %s. " % str(use_cudnn))
if data_format not in ["NCHW", "NHWC"]:
raise ValueError(
"Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
"Attr(data_format): %s." % str(data_format))
channel_last = (data_format == "NHWC")
num_channels = input.shape[3 + 1] if channel_last else input.shape[1 + 1]
if num_channels < 0:
raise ValueError(
"The channel dimmention of the input(%s) should be defined. "
"Received: %s." % (str(input.shape), str(num_channels)))
assert param_attr is not False, "param_attr should not be False here."
l_type = 'conv2d'
if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn):
l_type = 'depthwise_conv2d'
raise ValueError("l_type"
"%s is not implemented yet. " % (str(l_type)))
helper = MpcLayerHelper(l_type, **locals())
dtype = helper.input_dtype()
if groups is None:
num_filter_channels = num_channels
else:
if num_channels % groups != 0:
raise ValueError(
"the channel of input must be divisible by groups,"
"received: the channel of input is {}, the shape of input is {}"
", the groups is {}".format(num_channels, input.shape, groups))
num_filter_channels = num_channels // groups
filter_size = _convert_to_list(filter_size, 2, 'filter_size', np.int)
stride = _convert_to_list(stride, 2, 'stride', np.int)
dilation = _convert_to_list(dilation, 2, 'dilation', np.int)
# padding
def _update_padding(padding, data_format):
""" update padding accroding to data_format
raise ValueError if padding is not supported
"""
def is_list_or_tuple(ele):
""" return True if ele is a list or tuple
"""
if isinstance(ele, list) or isinstance(ele, tuple):
return True
return False
if is_list_or_tuple(padding) and len(padding) == 4:
if is_list_or_tuple(padding[0]) and (data_format == "NCHW"):
if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
raise ValueError(
"Non-zero padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[2:4]
padding = list(itertools.chain(*padding))
elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"):
if not (padding[0] == [0, 0] and padding[3] == [0, 0]):
raise ValueError(
"Non-zero padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[1:3]
padding = list(itertools.chain(*padding))
padding = _convert_to_list(padding, 4, 'padding', np.int)
if _is_symmetric_padding(padding, 2):
padding = [padding[0], padding[2]]
else:
padding = _convert_to_list(padding, 2, 'padding', np.int)
return padding
padding_algorithm = "EXPLICIT"
if isinstance(padding, str):
padding = padding.upper()
if padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." %
str(padding))
if padding == "VALID":
padding_algorithm = "VALID"
padding = [0, 0]
elif padding == "SAME":
padding_algorithm = "SAME"
padding = [0, 0]
padding = _update_padding(padding, data_format)
filter_shape = [num_filters, int(num_filter_channels)] + filter_size
filter_param = helper.create_mpc_parameter(
attr=helper.param_attr,
shape=filter_shape,
dtype=dtype)
pre_bias = helper.create_mpc_variable_for_type_inference(dtype)
helper.append_op(
type= 'mpc_' + l_type,
inputs={
'Input': input,
'Filter': filter_param,
},
outputs={"Output": pre_bias},
attrs={
'strides': stride,
'paddings': padding,
'dilations': dilation,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': False,
'fuse_relu_before_depthwise_conv': False,
"padding_algorithm": padding_algorithm,
"data_format": data_format,
})
if data_format == 'NCHW':
pre_act = helper.append_mpc_bias_op(pre_bias, dim_start=1, dim_end=2 + 1)
else:
pre_act = helper.append_mpc_bias_op(pre_bias, dim_start=3, dim_end=4 + 1)
return helper.append_mpc_activation(pre_act)
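# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). The access path
# pfl_mpc.layers.conv2d follows the layers package above; the data shape is
# illustrative only, since the exact mpc share layout expected by pfl_mpc.data
# is an assumption here.
#
#   import paddle_fl.mpc as pfl_mpc
#
#   pfl_mpc.init("aby3", role, "localhost", redis_server, redis_port)
#   images = pfl_mpc.data(name='images', shape=[None, 1, 28, 28], dtype='int64')
#   conv = pfl_mpc.layers.conv2d(input=images, num_filters=8, filter_size=3,
#                                act='relu')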
......@@ -14,10 +14,15 @@
"""
mpc ml op layers.
"""
import os
import numpy
from functools import reduce
import mpc_data_utils as mdu
from paddle.fluid.data_feeder import check_type, check_dtype
import numpy
import paddle.fluid.layers.utils as utils
from paddle.fluid.initializer import Constant
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.framework import Variable
from ..framework import MpcVariable
from ..framework import check_mpc_variable_and_dtype
from ..mpc_layer_helper import MpcLayerHelper
......@@ -27,6 +32,9 @@ __all__ = [
'relu',
'softmax',
'sigmoid_cross_entropy_with_logits',
'softmax_with_cross_entropy',
'pool2d',
'batch_norm',
]
......@@ -221,10 +229,14 @@ def relu(input, name=None):
helper = MpcLayerHelper('relu', **locals())
dtype = helper.input_dtype(input_param_name='input')
out = helper.create_mpc_variable_for_type_inference(dtype)
derivative = helper.create_mpc_variable_for_type_inference(dtype)
helper.append_op(
type="mpc_relu",
inputs={"X": input},
outputs={"Y": out})
outputs={
"Out": out,
"Derivative": derivative}
)
return out
......@@ -259,3 +271,282 @@ def sigmoid_cross_entropy_with_logits(x,
"Label": label},
outputs={"Out": out})
return out
def softmax_with_cross_entropy(logits,
label,
soft_label=False,
return_softmax=False,
axis=-1,
use_relu=False,
use_long_div=True):
"""
forward: out = softmax(x). todo: add cross_entropy
backward: dx = dout.expand * (softmax(x) - label)
use_relu: False(default): output = exp(x_i) / sum(exp(x_i))
True: output = relu(x_i) / sum(relu(x_i))
use_long_div: True(default): long division implemented by boolean circuit.
slow with high precision.
range of quotient: [0, 2^20).
False: find inverse of divisor by Newton's method.
fast with low precision.
range of divisor: (0, 2^15).
"""
attrs = {
'soft_label': soft_label,
'axis': axis,
'use_relu': use_relu,
'use_long_div': use_long_div
}
helper = MpcLayerHelper('softmax_with_cross_entropy', **locals())
softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
helper.append_op(
type='mpc_softmax_with_cross_entropy',
inputs={'Logits': logits,
'Label': label},
outputs={'Softmax': softmax,
'Loss': loss},
attrs=attrs)
if return_softmax:
return loss, softmax
else:
raise NotImplementedError("'return_softmax' should be true. Loss is NULL, only for backward.")
def pool2d(input,
pool_size=-1,
pool_type="max",
pool_stride=1,
pool_padding=0,
global_pooling=False,
ceil_mode=False,
name=None,
exclusive=True,
data_format="NCHW"):
"""
pool2d
"""
if pool_type not in ["max"]:
raise ValueError(
"Unknown Attr(pool_type): '%s'. It can only be 'max'.",
str(pool_type))
if global_pooling is False and pool_size == -1:
raise ValueError(
"When Attr(global_pooling) is False, Attr(pool_size) must be passed "
"and be a valid value. Received pool_size: %s." % str(pool_size))
if data_format not in ["NCHW"]:
raise ValueError(
"Attr(data_format) should be 'NCHW'. Received "
"Attr(data_format): %s." % str(data_format))
pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
def update_padding(padding, data_format):
"""
update_padding: convert to 2-dimension padding
"""
def is_list_or_tuple(ele):
"""
return true if ele is list or tuple.
"""
if isinstance(ele, list) or isinstance(ele, tuple):
return True
return False
        # convert padding size to 2 (H, W)
if is_list_or_tuple(padding) and len(padding) == 4:
if is_list_or_tuple(padding[0]):
if not (padding[0] == [0, 0] and padding[1] == [0, 0]):
raise ValueError(
"Non-zero pool_padding(%s) in the batch or channel dimensions "
"is not supported." % str(padding))
padding = padding[2:4] # data_format == "NCHW":
padding = [ele for a_list in padding for ele in a_list]
padding = utils.convert_to_list(padding, 4, 'padding')
if utils._is_symmetric_padding(padding, 2):
padding = [padding[0], padding[2]]
else:
padding = utils.convert_to_list(padding, 2, 'padding')
return padding
padding_algorithm = "EXPLICIT"
if isinstance(pool_padding, str):
pool_padding = pool_padding.upper()
if pool_padding not in ["SAME", "VALID"]:
raise ValueError(
"Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'."
% str(pool_padding))
if pool_padding == "VALID":
padding_algorithm = "VALID"
pool_padding = [0, 0]
if ceil_mode != False:
raise ValueError(
"When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. "
"Received ceil_mode: True.")
elif pool_padding == "SAME":
padding_algorithm = "SAME"
pool_padding = [0, 0]
pool_padding = update_padding(pool_padding, data_format) # [h, w]
op_type = 'pool2d'
helper = MpcLayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_mpc_variable_for_type_inference(dtype)
one_hot_tensor = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
type='mpc_' + op_type,
inputs={"X": input},
outputs={"Out": pool_out,
"One_hot_tensor": one_hot_tensor},
attrs={
"pooling_type": pool_type,
"ksize": pool_size,
"global_pooling": global_pooling,
"strides": pool_stride,
"paddings": pool_padding,
"padding_algorithm": padding_algorithm,
"ceil_mode": ceil_mode,
"exclusive": exclusive,
"data_format": data_format,
})
return pool_out
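# Note (not in the original source): besides the pooled output, mpc_pool2d also
# emits "One_hot_tensor"; presumably it marks, per pooling window, which input
# position held the maximum so the backward pass can route gradients there.
# This is an inference from the output name, not a documented contract.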
def batch_norm(input,
act=None,
is_test=False,
momentum=0.9,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
data_layout='NCHW',
in_place=False,
name=None,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=True,
use_global_stats=False):
"""
**Batch Normalization Layer**
"""
assert bias_attr is not False, "bias_attr should not be False in batch_norm."
helper = MpcLayerHelper('batch_norm', **locals())
check_mpc_variable_and_dtype(input, 'input', ['int64'], 'batch_norm')
dtype = helper.input_dtype()
has_reserve_space = False
if data_layout == 'NHWC':
flag = os.environ.get('FLAGS_cudnn_batchnorm_spatial_persistent')
if flag is not None and flag.lower() in ['true', '1']:
has_reserve_space = True
# plaintext_dtype = core.VarDesc.VarType.FP32
input_shape = input.shape
if data_layout == 'NCHW':
channel_num = input_shape[2]
else:
if data_layout == 'NHWC':
channel_num = input_shape[-1]
else:
raise ValueError("unsupported data layout:" + data_layout)
param_shape = [channel_num]
mpc_param_shape = [2, channel_num]
# create parameter
scale = helper.create_mpc_parameter(
attr=helper.param_attr,
shape=param_shape,
dtype=dtype,
default_initializer=Constant(mdu.mpc_one_share))
bias = helper.create_mpc_parameter(
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
mean = helper.create_mpc_parameter(
attr=ParamAttr(
name=moving_mean_name,
initializer=Constant(0),
trainable=False,
do_model_average=do_model_average_for_mean_and_var),
shape=param_shape,
dtype=dtype)
mean.stop_gradient = True
variance = helper.create_mpc_parameter(
attr=ParamAttr(
name=moving_variance_name,
initializer=Constant(mdu.mpc_one_share), # plaintext: 1
trainable=False,
do_model_average=do_model_average_for_mean_and_var),
shape=param_shape,
dtype=dtype)
variance.stop_gradient = True
# create output
# mean and mean_out share the same memory
mean_out = mean
# variance and variance out share the same memory
variance_out = variance
saved_mean = helper.create_mpc_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
saved_variance = helper.create_mpc_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
#reserve_space = None
#if has_reserve_space:
# reserve_space = helper.create_variable_for_type_inference(
# dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
batch_norm_out = input if in_place else \
helper.create_mpc_variable_for_type_inference(dtype)
inputs = {
"X": input,
"Scale": scale,
"Bias": bias,
"Mean": mean,
"Variance": variance
}
attrs = {
"epsilon": epsilon,
"is_test": is_test,
"data_layout": data_layout,
"use_mkldnn": False,
"fuse_with_relu": False,
"use_global_stats": use_global_stats
}
if isinstance(momentum, Variable):
inputs['MomemtumTensor'] = momentum
else:
attrs['momentum'] = momentum
outputs = {
"Y": batch_norm_out,
"MeanOut": mean_out,
"VarianceOut": variance_out,
"SavedMean": saved_mean,
"SavedVariance": saved_variance
}
#if reserve_space is not None:
# outputs["ReserveSpace"] = reserve_space
helper.append_op(
type="mpc_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
return helper.append_activation(batch_norm_out)
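# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module); layer access paths
# follow the __all__ additions above, while variable names and shapes are
# illustrative assumptions.
#
#   conv = pfl_mpc.layers.conv2d(input=images, num_filters=8, filter_size=3)
#   bn = pfl_mpc.layers.batch_norm(input=conv, act='relu')
#   pooled = pfl_mpc.layers.pool2d(input=bn, pool_size=2, pool_stride=2)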
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
mpc rnn op layers.
"""
from paddle.fluid.framework import in_dygraph_mode
from ..mpc_layer_helper import MpcLayerHelper
from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
def dynamic_gru(input,
size,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='relu',
h_0=None,
origin_mode=False):
"""
**Note: The input type of this must be LoDTensor. If the input type to be
processed is Tensor, use** :ref:`api_fluid_layers_StaticRNN` .
This operator is used to perform the calculations for a single layer of
Gated Recurrent Unit (GRU) on full sequences step by step. The calculations
in one time step support these two modes:
If ``origin_mode`` is True, then the formula used is from paper
`Learning Phrase Representations using RNN Encoder Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_ .
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
if ``origin_mode`` is False, then the formula used is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
:math:`x_t` is the input of current time step, but it is not from ``input`` .
This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` ,
    **Note** thus a fully-connected layer whose size is 3 times ``size`` should
be used before this operator, and the output should be used as ``input`` here.
:math:`h_{t-1}` is the hidden state from previous time step.
:math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for
update gate, reset gate, candidate hidden and hidden output separately.
:math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for
the weight matrix and bias used in update gate, reset gate, candidate hidden
calculations. For implementation, the three weight matrix are merged into a
tensor shaped :math:`[D, D \\times 3]` , the three bias are concatenated as
a tensor shaped :math:`[1, D \\times 3]` , where :math:`D` stands for the
hidden size; The data layout of weight tensor is: :math:`W_{uh}` and :math:`W_{rh}`
are concatenated with shape :math:`[D, D \\times 2]` lying on the first part,
and :math:`W_{ch}` lying on the latter part with shape :math:`[D, D]` .
Args:
input(Variable): A LoDTensor whose lod level is 1, representing the input
after linear projection. Its shape should be :math:`[T, 2, D \\times 3]` ,
            which is the mpc input transposed with axes {1, 0, 2} and with lod set on the mpc shares,
where :math:`T` stands for the total sequence lengths in this mini-batch,
:math:`D` for the hidden size. The data type should be int64.
size(int): Indicate the hidden size.
param_attr(ParamAttr, optional): To specify the weight parameter property.
Default: None, which means the default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr, optional): To specify the bias parameter property.
Default: None, which means the default bias parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` .
is_reverse(bool, optional): Whether to compute in the reversed order of
input sequences. Default False.
gate_activation(str, optional): The activation function corresponding to
:math:`act_g` in the formula. Only 'sigmoid' is supported now.
candidate_activation(str, optional): The activation function corresponding to
:math:`act_c` in the formula. Only "relu" is supported now.
h_0 (Variable, optional): A Tensor representing the initial hidden state.
It not provided, the default initial hidden state is 0. The shape is
:math:`[2, N, D]` , where :math:`N` is the number of sequences in the
mini-batch, :math:`D` for the hidden size. The data type should be
same as ``input`` . Default None.
Returns:
Variable: A LoDTensor whose lod level is 1 and shape is :math:`[2, T, D]` , \
where :math:`T` stands for the total sequence lengths in this mini-batch \
:math:`D` for the hidden size. It represents GRU transformed sequence output, \
and has the same lod and data type with ``input`` .
Examples:
.. code-block:: python
import paddle.fluid as fluid
dict_dim, emb_dim = 128, 64
data = fluid.data(name='sequence',
shape=[None],
dtype='int64',
lod_level=1)
emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
"""
assert in_dygraph_mode(
) is not True, "please use gru instead of dynamic_gru in dygraph mode!"
helper = MpcLayerHelper('mpc_gru', **locals())
dtype = helper.input_dtype()
check_variable_and_dtype(input, 'input', ['int64'], 'mpc_gru')
check_dtype(dtype, 'dtype', ['int64'], 'mpc_gru')
weight = helper.create_mpc_parameter(
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
bias = helper.create_mpc_parameter(
attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
batch_size = input.shape[0]
inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
if h_0:
assert h_0.shape == (
2, batch_size, size
        ), 'The shape of h0 should be (2, batch_size, %d)' % size
inputs['H0'] = h_0
hidden = helper.create_mpc_variable_for_type_inference(dtype)
batch_gate = helper.create_mpc_variable_for_type_inference(dtype)
batch_reset_hidden_prev = helper.create_mpc_variable_for_type_inference(dtype)
batch_hidden = helper.create_mpc_variable_for_type_inference(dtype)
helper.append_op(
type='mpc_gru',
inputs=inputs,
outputs={
'Hidden': hidden,
'BatchGate': batch_gate,
'BatchResetHiddenPrev': batch_reset_hidden_prev,
'BatchHidden': batch_hidden
},
attrs={
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'activation': candidate_activation,
'origin_mode': origin_mode
})
return hidden
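# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module), adapted from the
# fluid example in the docstring above. `emb` stands for an upstream mpc
# LoDTensor; the fc output feeding dynamic_gru must have size 3 * hidden_dim,
# and all tensors are int64 mpc shares.
#
#   hidden_dim = 512
#   x = pfl_mpc.layers.fc(input=emb, size=hidden_dim * 3)
#   hidden = dynamic_gru(input=x, size=hidden_dim)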
......@@ -29,7 +29,7 @@ from paddle.fluid.initializer import ConstantInitializer
# mpc_paddle module
from .framework import MpcVariable, MpcParameter, create_mpc_parameter, create_mpc_var
from .initializer import XavierInitializer
class MpcLayerHelper(LayerHelper):
"""
......@@ -100,7 +100,7 @@ class MpcLayerHelper(LayerHelper):
if is_bias:
attr._set_default_bias_initializer()
else:
attr._set_default_initializer(ConstantInitializer(0))
attr._set_default_initializer(XavierInitializer(seed=65536))
else:
attr._set_default_initializer(default_initializer)
......@@ -215,11 +215,14 @@ class MpcLayerHelper(LayerHelper):
tmp = self.create_mpc_variable_for_type_inference(
dtype=input_var.dtype)
derivative = self.create_mpc_variable_for_type_inference(
dtype=input_var.dtype)
# add "mpc_" as prefix of mpc activation
self.append_op(
type="mpc_" + act_type,
inputs={"X": [input_var]},
outputs={"Out": [tmp]},
outputs={"Out": [tmp],
"Derivative": [derivative]},
attrs=act)
return tmp
......
......@@ -22,6 +22,8 @@ from paddle.fluid.framework import Variable
from paddle.fluid.clip import error_clip_callback
from paddle.fluid import unique_name
from paddle.fluid.initializer import Constant
from paddle import fluid
from paddle.fluid import core
from .backward import append_backward
from .mpc_layer_helper import MpcLayerHelper
......@@ -135,7 +137,7 @@ class MPCSGDOptimizer(Optimizer):
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='double',
dtype='float32',
persistable=True)
def _create_param_lr(self, param_and_grad):
......@@ -166,6 +168,240 @@ class MPCSGDOptimizer(Optimizer):
return self._learning_rate_map.get(program, None)
class MPCAdamOptimizer(Optimizer):
"""
The Adam optimizer uses an optimization described at the end
of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
it can dynamically adjusts the learning rate of each parameter using
the 1st moment estimates and the 2nd moment estimates of the gradient.
The parameter ``param_out`` update rule with gradient ``grad``:
.. math::
t & = t + 1
moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
learning\_rate & = learning\_rate * \\
\\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
Args:
learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``.
It can be a float value or a ``Variable`` with a float type. The default value is 0.001.
beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates.
It should be a float number or a Variable with shape [1] and data type as float32.
The default value is 0.9.
beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates.
It should be a float number or a Variable with shape [1] and data type as float32.
The default value is 0.999.
epsilon (float, optional): A small float value for numerical stability.
The default value is 1e-08.
name (str, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.
"""
_moment1_acc_str = "moment1"
_moment2_acc_str = "moment2"
_beta1_pow_acc_str = "beta1_pow_acc"
_beta2_pow_acc_str = "beta2_pow_acc"
def __init__(self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-4,
name=None):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
assert epsilon is not None
super(MPCAdamOptimizer, self).__init__(
learning_rate=learning_rate,
name=name)
self.type = "adam"
self._beta1 = beta1
self._beta2 = beta2
self._epsilon = epsilon
self.type = "mpc_adam"
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
# Create accumulator tensors for first and second moments
for p in parameters:
self._add_accumulator(self._moment1_acc_str, p)
self._add_accumulator(self._moment2_acc_str, p)
self._add_accumulator(
name=self._beta1_pow_acc_str,
param=p,
fill_value=0.9 if isinstance(self._beta1, Variable) \
else self._beta1,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu', dtype='float32')
self._add_accumulator(
name=self._beta2_pow_acc_str,
param=p,
fill_value=0.999 if isinstance(self._beta2, Variable) \
else self._beta2,
shape=[1],
type=core.VarDesc.VarType.LOD_TENSOR, device='cpu', dtype='float32')
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
moment1 = self._get_accumulator(self._moment1_acc_str,
param_and_grad[0])
moment2 = self._get_accumulator(self._moment2_acc_str,
param_and_grad[0])
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param_and_grad[0])
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
# create the adam optimize op
inputs = {
"Param": [param_and_grad[0]],
"Grad": [param_and_grad[1]],
"LearningRate": [lr],
"Moment1": [moment1],
"Moment2": [moment2],
"Beta1Pow": [beta1_pow_acc],
"Beta2Pow": [beta2_pow_acc]
}
outputs = {
"ParamOut": [param_and_grad[0]],
"Moment1Out": [moment1],
"Moment2Out": [moment2],
"Beta1PowOut": [beta1_pow_acc],
"Beta2PowOut": [beta2_pow_acc],
}
attrs = {
"epsilon": self._epsilon,
}
if isinstance(self._beta1, Variable):
inputs['Beta1Tensor'] = self._beta1
else:
attrs['beta1'] = self._beta1
if isinstance(self._beta2, Variable):
inputs['Beta2Tensor'] = self._beta2
else:
attrs['beta2'] = self._beta2
adam_op = block.append_op(
type=self.type,
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
return adam_op
def _create_global_learning_rate(self):
lr = self._global_learning_rate()
if isinstance(lr, framework.Variable):
return
else:
if not isinstance(self._learning_rate, float):
raise TypeError(
"learning rate variable is create outside optimizer,"
"can not create new learning rate variable for new program")
# create learning rate in the current main program
self._learning_rate_map[framework.default_main_program(
)] = create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32',
persistable=True)
def _create_param_lr(self, param_and_grad):
"""
create learning rate parameter
"""
# create learning rate variable for every parameter
param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate']
if type(param_lr) == Variable:
return param_lr
else:
if param_lr == 1.0:
return self._global_learning_rate()
else:
with fluid.default_main_program()._lr_schedule_guard(
is_with_opt=True), framework.name_scope(
'scale_with_param_lr'):
return self._global_learning_rate() * param_lr
def _global_learning_rate(self, program=None):
"""
get global decayed learning rate
:return:
"""
if program is None:
program = framework.default_main_program()
return self._learning_rate_map.get(program, None)
def backward(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None,
callbacks=None):
"""
The first part of ``minimize``, do auto-diff to append backward operations for
the current program.
Args:
loss (Variable): ``loss`` variable to run optimizations.
startup_program (Program, optional): :ref:`api_fluid_Program` for
initializing parameters in ``parameter_list``. The default value
is None, at this time :ref:`api_fluid_default_startup_program` will be used.
parameter_list (list, optional): List of ``Variable`` names to update
to minimize ``loss``. The default value is None, at this time all parameters
will be updated.
no_grad_set (set, optional): Set of ``Variable`` objects that don't need
to be updated. The default value is None.
callbacks (list, optional): list of callable objects to run when appending backward
operator for one parameter. The default value is None.
Return:
list: list of (param, grad) variable pairs, param is ``Parameter``,
grad is the gradient value corresponding to the parameter.
Examples:
See examples in ``apply_gradients``.
"""
no_grad_set = self._get_no_grad_set(loss, no_grad_set)
self._dtype = loss.dtype
if callbacks is None:
callbacks = [error_clip_callback]
else:
assert (isinstance(callbacks, list))
program = loss.block.program
assert len(loss.shape) == 2 and loss.shape[0] == 2 and loss.shape[1] == 1, \
"The loss.shape should be (2L,), but the current loss.shape is {}. " \
"Maybe that you should call fluid.layers.mean to process the current loss.".format(
loss.shape)
with program_guard(program, startup_program):
params_grads = append_backward(loss, parameter_list, no_grad_set,
callbacks)
# Note: since we can't use all_reduce_op now,
# dgc_op should be the last op of one grad.
self._append_dgc_ops(params_grads)
return params_grads
def create_global_var(shape,
value,
dtype,
......@@ -209,3 +445,4 @@ def create_global_var(shape,
SGD = MPCSGDOptimizer
Adam = MPCAdamOptimizer
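# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). The pfl_mpc.optimizer
# access path and the mean layer name are assumptions; as the backward()
# assertion in MPCAdamOptimizer above requires, the loss passed to minimize()
# must be an mpc share tensor of shape (2, 1).
#
#   cost = pfl_mpc.layers.sigmoid_cross_entropy_with_logits(y_pre, y)
#   avg_loss = pfl_mpc.layers.mean(cost)   # hypothetical name; any op yielding a (2, 1) share works
#   pfl_mpc.optimizer.SGD(learning_rate=0.001).minimize(avg_loss)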
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import warnings
import numpy as np
import random
import six
import time
import itertools
import collections
from collections import defaultdict
from multiprocessing import Pipe, Process, Manager
import traceback
import redis
import paddle_fl.mpc as pfl_mpc
import paddle_fl.mpc.data_utils.aby3 as aby3
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle_fl.mpc.backward import append_backward
from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
from paddle.fluid.framework import Program, OpProtoHolder, Variable
from testsuite import create_op, set_input, append_input_output, append_loss_ops
from paddle.fluid import unique_name
def _set_use_system_allocator(value=None):
USE_SYSTEM_ALLOCATOR_FLAG = "FLAGS_use_system_allocator"
old_value = core.globals()[USE_SYSTEM_ALLOCATOR_FLAG]
value = old_value if value is None else value
core.globals()[USE_SYSTEM_ALLOCATOR_FLAG] = value
return old_value
def randomize_probability(batch_size, class_num, dtype='float32'):
prob = np.random.uniform(
0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
prob_sum = prob.sum(axis=1)
for i in six.moves.xrange(len(prob)):
prob[i] /= prob_sum[i]
return prob
def skip_check_grad_ci(reason=None):
"""Decorator to skip check_grad CI.
Check_grad is required for Op test cases. However, there are some special
cases that do not need to do check_grad. This decorator is used to skip the
check_grad of the above cases.
Note: the execution of unit test will not be skipped. It just avoids check_grad
checking in tearDownClass method by setting a `no_need_check_grad` flag.
Example:
@skip_check_grad_ci(reason="For inference, check_grad is not required.")
class TestInference(OpTest):
"""
if not isinstance(reason, str):
raise AssertionError("The reason for skipping check_grad is required.")
def wrapper(cls):
cls.no_need_check_grad = True
return cls
return wrapper
class Aby3Process(Process):
"""
    Extends Process to evaluate a computation party in aby3.
"""
def __init__(self, *args, **kwargs):
Process.__init__(self, *args, **kwargs)
self._pconn, self._cconn = Pipe()
self._exception = None
def run(self):
"""
Override. Send any exceptions raised in
subprocess to main process.
"""
try:
Process.run(self)
self._cconn.send(None)
except Exception as e:
tb = traceback.format_exc()
self._cconn.send((e, tb))
@property
def exception(self):
"""
Get exception.
"""
if self._pconn.poll():
self._exception = self._pconn.recv()
return self._exception
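# ---------------------------------------------------------------------------
# Minimal sketch (not part of the original file) of how Aby3Process is meant to
# be driven, mirroring OpTest.multi_party_run below: one process per party, and
# the parent re-raises any exception shipped back over the pipe. The helper
# name is hypothetical.
#
#   def _run_parties(target, **kwargs):
#       parties = []
#       for role in range(3):
#           kwargs.update({'role': role})
#           parties.append(Aby3Process(target=target, kwargs=kwargs))
#           parties[-1].start()
#       for party in parties:
#           party.join()
#           if party.exception:
#               raise RuntimeError(party.exception[1])   # traceback string
#       return True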
class OpTest(unittest.TestCase):
def __init__(self, methodName='runTest'):
super(OpTest, self).__init__(methodName)
# set redis server and port
self.server = os.environ['TEST_REDIS_IP']
self.port = os.environ['TEST_REDIS_PORT']
self.party_num = 3
def setUp(self):
"""
Connect redis and delete all keys in all databases on the current host.
:return:
"""
r = redis.Redis(host=self.server, port=int(self.port))
r.flushall()
def multi_party_run(self, **kwargs):
"""
Run 3 parties with target function or other additional arguments.
:param kwargs:
:return:
"""
r = redis.Redis(host=self.server, port=int(self.port))
r.flushall()
target = kwargs['target']
partys = []
for role in range(self.party_num):
kwargs.update({'role': role})
partys.append(Aby3Process(target=target, kwargs=kwargs))
partys[-1].start()
for party in partys:
party.join()
if party.exception:
return party.exception
return (True,)
@classmethod
def setUpClass(cls):
'''Fix random seeds to remove randomness from tests'''
cls._np_rand_state = np.random.get_state()
cls._py_rand_state = random.getstate()
cls.call_once = False
cls.dtype = None
cls.outputs = {}
cls.input_shape_is_large = True
np.random.seed(123)
random.seed(124)
cls._use_system_allocator = _set_use_system_allocator(True)
@classmethod
def tearDownClass(cls):
"""Restore random seeds"""
np.random.set_state(cls._np_rand_state)
random.setstate(cls._py_rand_state)
_set_use_system_allocator(cls._use_system_allocator)
def is_empty_grad_op(op_type):
all_op_kernels = core._get_all_register_op_kernels()
grad_op = op_type + '_grad'
if grad_op in all_op_kernels.keys():
return False
return True
if not hasattr(cls, "op_type"):
raise AssertionError(
"This test do not have op_type in class attrs, "
"please set self.__class__.op_type=the_real_op_type manually.")
if not hasattr(cls, "no_need_check_grad") \
and not is_empty_grad_op(cls.op_type):
if not cls.input_shape_is_large and not hasattr(cls, "exist_check_grad"):
raise AssertionError(
"Input's shape should be large than or equal to 100 for " +
cls.op_type + " Op.")
def try_call_once(self, data_type):
if not self.call_once:
self.call_once = True
self.dtype = data_type
def infer_dtype_from_inputs_outputs(self, inputs, outputs):
def is_np_data(input):
return isinstance(input, (np.ndarray, np.generic))
def infer_dtype(numpy_dict, dtype_set):
assert isinstance(
numpy_dict,
dict), "self.inputs, self.outputs must be numpy_dict"
# the inputs are as follows:
# case 1: inputs = {'X': x}
# case 2: inputs = {'X': (x, x_lod)}
# case 3: inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
# case 4: inputs = {'X': [("x1", (x1, [x1_lod1])), ("x2", (x2, [x2_.lod2]))]}
# TODO(juncaipeng) infer dtype from inputs maybe obtain wrong type.
for _, var_value in six.iteritems(numpy_dict):
if is_np_data(var_value): # case 1
dtype_set.add(var_value.dtype)
elif isinstance(var_value, (list, tuple)): # case 2, 3, 4
for sub_val_value in var_value:
if is_np_data(sub_val_value): # case 2
dtype_set.add(sub_val_value.dtype)
elif len(sub_val_value) > 1 and is_np_data(
sub_val_value[1]): # case 3
dtype_set.add(sub_val_value[1].dtype)
elif len(sub_val_value) > 1 and isinstance(sub_val_value[1], (list, tuple)) \
and is_np_data(sub_val_value[1][0]): # case 4
dtype_set.add(sub_val_value[1][0].dtype)
# infer dtype from inputs, and dtype means the precision of the test
# collect dtype of all inputs
dtype_set = set()
infer_dtype(inputs, dtype_set)
dtype_list = [
np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16),
np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.int16),
np.dtype(np.int8), np.dtype(np.uint8), np.dtype(np.bool)
]
# check the dtype in dtype_list in order, select the first dtype that in dtype_set
for dtype in dtype_list:
if dtype in dtype_set:
self.dtype = dtype
break
# save dtype in class attr
self.__class__.dtype = self.dtype
def feed_var(self, input_vars, place):
feed_map = {}
for var_name in input_vars:
if isinstance(input_vars[var_name], list):
for name, np_value in self.inputs[var_name]:
tensor = core.LoDTensor()
if isinstance(np_value, tuple):
tensor.set(np_value[0], place)
tensor.set_recursive_sequence_lengths(np_value[1])
else:
tensor.set(np_value, place)
feed_map[name] = tensor
else:
tensor = core.LoDTensor()
if isinstance(self.inputs[var_name], tuple):
tensor.set(self.inputs[var_name][0], place)
tensor.set_recursive_sequence_lengths(self.inputs[var_name][
1])
else:
tensor.set(self.inputs[var_name], place)
feed_map[var_name] = tensor
return feed_map
def _append_ops(self, block):
self.__class__.op_type = self.op_type # for ci check, please not delete it for now
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
"infer datatype from inputs and outputs for this test case"
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
inputs = append_input_output(block, op_proto, self.inputs, True,
self.dtype)
outputs = append_input_output(block, op_proto, self.outputs, False,
self.dtype)
op = block.append_op(
type=self.op_type,
inputs=inputs,
outputs=outputs,
attrs=self.attrs if hasattr(self, "attrs") else dict())
# infer variable type and infer shape in compile-time
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
return op
def _get_io_vars(self, block, numpy_inputs):
inputs = {}
for name, value in six.iteritems(numpy_inputs):
if isinstance(value, list):
var_list = [
block.var(sub_name) for sub_name, sub_value in value
]
inputs[name] = var_list
else:
inputs[name] = block.var(name)
return inputs
def _get_inputs(self, block):
return self._get_io_vars(block, self.inputs)
def _get_outputs(self, block):
return self._get_io_vars(block, self.outputs)
def calc_output(self, place):
outs, _ = self._calc_output(place)
return outs
def _calc_output(self,
place,
parallel=False,
no_check_set=None,
loss=None,
enable_inplace=None,
for_inplace_test=False):
program = Program()
block = program.global_block()
op = self._append_ops(block)
inputs = self._get_inputs(block)
outputs = self._get_outputs(block)
feed_map = self.feed_var(inputs, place)
if for_inplace_test:
# Some variables' tensors hold no buffer (tensor's _holder is NULL), like XShape in reshape2 op,
# and the shapes of those variables contain 0 (eg. Xshape.shape = [0, 2, 5]).
            # Set persistable for those variables in order to get them from global_scope for the inplace grad test directly rather than feeding them,
# since feed op calls check_memory_size() which fails when tensor's holder_ is NULL.
for name in op.output_arg_names:
var = block.var(name)
var.persistable = True
original_program = program
#if parallel:
# use_cuda = False
# if isinstance(place, fluid.CUDAPlace):
# use_cuda = True
# compiled_prog = fluid.CompiledProgram(program).with_data_parallel(
# loss_name=loss.name if loss else None, places=place)
# program = compiled_prog
fetch_list = getattr(self, "fetch_list", [])
# if the fetch_list is customized by user, we use it directly.
# if not, fill the fetch_list by the user configured outputs in test.
if len(fetch_list) == 0:
for var_name, var in six.iteritems(outputs):
if no_check_set is not None and var_name in no_check_set:
continue
if isinstance(var, list):
for v in var:
fetch_list.append(v.name)
else:
fetch_list.append(var.name)
        # if the fetch_list is still empty, fill it with the operator outputs.
if len(fetch_list) == 0:
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
fetch_list.append(str(out_name))
if enable_inplace is not None:
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = enable_inplace
compiled_prog = fluid.CompiledProgram(program).with_data_parallel(
build_strategy=build_strategy, places=place)
program = compiled_prog
return_results = [Manager().list() for _ in range(len(fetch_list))]
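        # each ABY3 party executes the same program in its own process (see
        # multi_party_run); per-party outputs are gathered via Manager lists and
        # reconstructed into plaintext with aby3.reconstruct once all parties finish.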
def closure(**kwargs):
role = kwargs['role']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
#init_op = fluid.default_main_program().global_block().ops[0]
#_insert_init_op(program, init_op)
executor = Executor(place)
executor.run()
outs = executor.run(program,
feed=feed_map,
fetch_list=fetch_list)
for idx in range(len(fetch_list)):
return_results[idx].append(outs[idx])
ret = self.multi_party_run(target=closure)
self.assertEqual(ret[0], True)
outs = []
for idx in range(len(fetch_list)):
outs.append(aby3.reconstruct(np.array(return_results[idx])))
self.op = op
self.program = original_program
if for_inplace_test:
return outs, fetch_list, feed_map, original_program, op.desc
else:
return outs, fetch_list
def _get_need_run_ops(self, op_desc, fwd_op_desc=None):
"""Postorder traversal of the 'grad' tree to get all ops that need to run during inplace test.
        An op needs to run during the inplace check if,
        (1) it has infer_inplace,
        (2) it has infer_inplace in its grad descendants. (since we need its outputs to construct its grad's inputs)
Args:
op_desc (OpDesc): The op_desc of current op.
fwd_op_desc (OpDesc): The op_desc of current op's forward op, None if current op has no forward op.
Eg. relu's fwd_op is None, relu_grad's fwd_op is relu, relu_grad_grad's fwd_op is relu_grad, etc.
Returns:
need_run_ops (list[(op_desc, fwd_op_desc)]): The ops that need to run during inplace test.
"""
need_run_ops = []
visited_ops = []
def _dfs_grad_op(op_desc, fwd_op_desc=None):
visited_ops.append(op_desc.type())
has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
has_grad_op_maker = fluid.core.has_grad_op_maker(op_desc.type())
has_infer_inplace_in_grad_descendants = False
if not has_grad_op_maker:
                has_infer_inplace_in_grad_descendants = False
else:
# get grad_op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
op_desc, set(), [])
if not grad_op_desc_list:
has_infer_inplace_in_grad_descendants = False
else:
for i, grad_op_desc in enumerate(grad_op_desc_list):
if grad_op_desc.type(
) not in visited_ops and _dfs_grad_op(
grad_op_desc, fwd_op_desc=op_desc):
has_infer_inplace_in_grad_descendants = True
if has_infer_inplace or has_infer_inplace_in_grad_descendants:
need_run_ops.append((op_desc, fwd_op_desc))
return True
else:
return False
_dfs_grad_op(op_desc, fwd_op_desc=fwd_op_desc)
return need_run_ops
def check_inplace_output_with_place(self,
place,
no_check_set=None,
inplace_atol=None):
"""Chech the inplace correctness of given op, its grad op, its grad_grad op, etc.
(1) Get all ops need to run. (see conditions in _get_need_run_ops())
(2) Run op in need_run_ops, and do inplace check if it has infer_inplace.
Args:
place (CPUPlace | CUDAPlace): The place where the op runs.
no_check_set (list): The names of outputs that needn't check, like XShape of reshape op.
inplace_atol (float): The tolerable error, only set when op doesn't ensure computational consistency, like group_norm op.
Returns:
None
"""
has_infer_inplace = fluid.core.has_infer_inplace(self.op_type)
has_grad_op_maker = fluid.core.has_grad_op_maker(self.op_type)
fwd_res = self._calc_output(
place, no_check_set=no_check_set, for_inplace_test=True)
op_desc = fwd_res[4]
need_run_ops = self._get_need_run_ops(op_desc)
res = {}
for op_desc, father_op_desc in reversed(need_run_ops):
# The first one is the forward op
has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
if op_desc.type() == self.op_type:
if has_infer_inplace:
res[op_desc] = self._check_forward_inplace(
place,
no_check_set=no_check_set,
inplace_atol=inplace_atol)
else:
res[op_desc] = self._calc_output(
place, no_check_set=no_check_set, for_inplace_test=True)
else:
if has_infer_inplace:
fwd_res = res[father_op_desc]
res[op_desc] = self._check_grad_inplace(
place, fwd_res, op_desc, inplace_atol=inplace_atol)
else:
res[op_desc] = self._calc_grad_output(place, fwd_res,
op_desc)
def check_output_with_place(self,
place,
atol=0,
no_check_set=None,
equal_nan=False,
check_dygraph=True,
inplace_atol=None):
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs:
continue
if no_check_set is not None and out_name in no_check_set:
continue
def find_imperative_actual(target_name, dygraph_outs, place):
with fluid.dygraph.base.guard(place=place):
for name in dygraph_outs:
if name == target_name:
return dygraph_outs[name][0]
var_list = dygraph_outs[name]
for i, var in enumerate(var_list):
if var.name == target_name:
return dygraph_outs[name][i]
self.assertTrue(False, "Found failed {} {}".format(
dygraph_outs.keys(), target_name))
def find_actual(target_name, fetch_list):
found = [
i for i, var_name in enumerate(fetch_list)
if var_name == target_name
]
self.assertTrue(
len(found) == 1, "Found {} {}".format(
len(found), target_name))
return found[0]
if out_dup:
sub_out = self.outputs[out_name]
if not isinstance(sub_out, list):
raise AssertionError("sub_out type %s is not list",
type(sub_out))
for item in sub_out:
sub_out_name, expect = item[0], item[1]
idx = find_actual(sub_out_name, fetch_list)
actual = outs[idx]
actual_t = np.array(actual)
expect_t = expect[0] \
if isinstance(expect, tuple) else expect
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " +
str(place))
if isinstance(expect, tuple):
self.assertListEqual(
actual.recursive_sequence_lengths(), expect[1],
"Output (" + sub_out_name +
") has different lod at " + str(place))
else:
idx = find_actual(out_name, fetch_list)
actual = outs[idx]
actual_t = np.array(actual)
expect = self.outputs[out_name]
expect_t = expect[0] if isinstance(expect, tuple) else expect
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t) + " in class " + self.__class__.__name__)
if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name +
") has different lod at " + str(place))
# Note(zhiqiu): inplace_atol should be only set when op doesn't ensure
# computational consistency.
# For example, group_norm uses AtomicAdd on CUDAPlace, which do not ensure
# computation order when multiple threads write the same address. So the
# result of group_norm is non-deterministic when datatype is float.
# When inplace_atol is not None, the inplace check uses numpy.allclose
# to check inplace result instead of numpy.array_equal.
if inplace_atol is not None:
warnings.warn(
"inplace_atol should only be set when op doesn't ensure computational consistency, please check it!"
)
# Check inplace for given op, its grad op, its grad_grad op, etc.
# No effect on original OpTest
self.check_inplace_output_with_place(
place, no_check_set=no_check_set, inplace_atol=inplace_atol)
return outs, fetch_list
def _assert_is_close(self, numeric_grads, analytic_grads, names,
max_relative_error, msg_prefix):
for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
            # The check asserts np.abs(a - b) / np.abs(a) < max_relative_error.
            # To avoid an overly strict relative check for near-zero gradients,
            # np.abs(a) is clamped to 1 for elements whose magnitude is below 1e-3,
            # which turns the comparison into an absolute-error check for those elements.
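            # e.g. with max_relative_error = 5e-3: a = 2e-4, b = 3e-4 gives
            # |a - b| / 1 = 1e-4, which passes, whereas the raw relative error
            # of 0.5 would spuriously fail for such near-zero gradients.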
abs_a = np.abs(a)
abs_a[abs_a < 1e-3] = 1
diff_mat = np.abs(a - b) / abs_a
max_diff = np.max(diff_mat)
def err_msg():
offset = np.argmax(diff_mat > max_relative_error)
return ("%s error, %s variable %s max gradient diff %f over limit %f, "
"the first error element is %d, expected %f, but got %f.") \
% (self.op_type, msg_prefix, name, max_diff, max_relative_error,
offset, a.flatten()[offset], b.flatten()[offset])
self.assertLessEqual(max_diff, max_relative_error, err_msg())
def _check_grad_helper(self):
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
self.__class__.op_type = self.op_type
self.__class__.exist_check_grad = True
def check_grad_with_place(self,
place,
inputs_to_check,
output_names,
no_grad_set=None,
numeric_grad_delta=0.005,
in_place=False,
max_relative_error=0.005,
user_defined_grads=None,
check_dygraph=True):
self.scope = core.Scope()
op_inputs = self.inputs if hasattr(self, "inputs") else dict()
op_outputs = self.outputs if hasattr(self, "outputs") else dict()
op_attrs = self.attrs if hasattr(self, "attrs") else dict()
self._check_grad_helper()
cache_list = None
if hasattr(self, "cache_name_list"):
cache_list = self.cache_name_list
self.op = create_op(
self.scope,
self.op_type,
op_inputs,
op_outputs,
op_attrs,
cache_list=cache_list)
if no_grad_set is None:
no_grad_set = set()
for input_to_check in inputs_to_check:
set_input(self.scope, self.op, self.inputs, place)
tensor_to_check = self.scope.find_var(input_to_check).get_tensor()
tensor_size = six.moves.reduce(lambda a, b: a * b,
tensor_to_check.shape(), 1)
if tensor_size < 100:
self.__class__.input_shape_is_large = False
        if not isinstance(output_names, list):
output_names = [output_names]
numeric_grads = user_defined_grads or [
self.get_numeric_gradient(
place,
self.scope,
self.op,
self.inputs,
input_to_check,
output_names,
delta=numeric_grad_delta,
in_place=in_place) for input_to_check in inputs_to_check
]
analytic_grads = self._get_gradient(inputs_to_check, place,
output_names, no_grad_set)
self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
max_relative_error,
"Gradient Check On %s" % str(place))
@staticmethod
def _numpy_to_lod_tensor(np_value, lod, place):
tensor = core.LoDTensor()
tensor.set(np_value, place)
if lod is not None:
tensor.set_recursive_sequence_lengths(lod)
return tensor
@staticmethod
def np_dtype_to_fluid_dtype(input):
return input
@staticmethod
def fluid_dtype_to_np_dtype(self, dtype):
return dtype
@staticmethod
def np_value_to_fluid_value(input):
return input
def _get_gradient(self,
input_to_check,
place,
output_names,
no_grad_set,
parallel=False):
prog = Program()
block = prog.global_block()
self._append_ops(block)
loss = append_loss_ops(block, output_names)
param_grad_list = append_backward(
loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
inputs = self._get_inputs(block)
feed_dict = self.feed_var(inputs, place)
fetch_list = [g for p, g in param_grad_list]
return_results = [Manager().list() for _ in range(len(fetch_list))]
def closure(**kwargs):
role = kwargs['role']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
#init_op = fluid.default_main_program().global_block().ops[0]
#_insert_init_op(program, init_op)
executor = Executor(place)
executor.run()
outs = executor.run(prog,
feed=feed_dict,
fetch_list=fetch_list)
for idx in range(len(fetch_list)):
return_results[idx].append(outs[idx])
ret = self.multi_party_run(target=closure)
self.assertEqual(ret[0], True)
outs = []
for idx in range(len(fetch_list)):
outs.append(aby3.reconstruct(np.array(return_results[idx])))
return outs
def get_numeric_gradient(self,
place,
scope,
op,
inputs,
input_to_check,
output_names,
delta=0.005,
in_place=False):
# FIXME: change this method by compile time concepts
set_input(scope, op, inputs, place)
def product(dim):
return six.moves.reduce(lambda a, b: a * b, dim, 1)
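        # the mpc tensor to perturb carries a leading dimension of 2 local shares in
        # 2^16 fixed point; reveal() rescales it to float and keeps the first share so
        # that the element-wise perturbation below operates on a plain LoDTensor.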
reveal = lambda x: (2**-16 * np.array(x))[0].astype('float32')
tensor_to_check = scope.find_var(input_to_check).get_tensor()
tensor_to_check = reveal(tensor_to_check)
tensor_to_check_ = fluid.LoDTensor()
tensor_to_check_.set(tensor_to_check, fluid.CPUPlace())
tensor_to_check = tensor_to_check_
tensor_size = product(tensor_to_check.shape())
tensor_to_check_dtype = tensor_to_check._dtype()
if tensor_to_check_dtype == core.VarDesc.VarType.FP32:
tensor_to_check_dtype = np.float32
elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
tensor_to_check_dtype = np.float64
elif tensor_to_check_dtype == core.VarDesc.VarType.FP16:
tensor_to_check_dtype = np.float16
            # cast delta to np.float16; it is automatically promoted to float32/float64 in later arithmetic
delta = np.array(delta).astype(np.float16)
else:
raise ValueError("Not supported data type " + str(
tensor_to_check_dtype))
def get_output():
sum = []
return_results = dict()
for name in (output_names):
return_results[name] = Manager().list()
def closure(**kwargs):
role = kwargs['role']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
executor = Executor(place)
executor.run()
op.run(scope, place)
for name in output_names:
out = np.array(scope.find_var(name).get_tensor())
return_results[name].append(out[0])
ret = self.multi_party_run(target=closure)
self.assertEqual(ret[0], True)
for output_name in output_names:
plain = aby3.reconstruct(np.array(return_results[output_name]))
sum.append(plain.mean())
return tensor_to_check_dtype(np.array(sum).sum() / len(output_names))
gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
def __get_elem__(tensor, i):
if tensor_to_check_dtype == np.float16:
numpy_tensor = np.array(tensor).astype(np.float16)
numpy_tensor = numpy_tensor.flatten()
return numpy_tensor[i]
elif tensor_to_check_dtype == np.float32:
return tensor._get_float_element(i)
else:
return tensor._get_double_element(i)
def __set_elem__(tensor, i, e):
if tensor_to_check_dtype == np.float16:
numpy_tensor = np.array(tensor).astype(np.float16)
shape = numpy_tensor.shape
numpy_tensor = numpy_tensor.flatten()
numpy_tensor[i] = e
numpy_tensor = numpy_tensor.reshape(shape)
tensor.set(numpy_tensor, place)
elif tensor_to_check_dtype == np.float32:
tensor._set_float_element(i, e)
else:
tensor._set_double_element(i, e)
# we only compute gradient of one element each time.
# we use a for loop to compute the gradient of every element.
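        # central difference: grad_i ~ (f(x_i + delta) - f(x_i - delta)) / (2 * delta),
        # where f() is the scalar produced by get_output() (the mean of the reconstructed outputs).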
for i in six.moves.xrange(tensor_size):
if in_place:
set_input(scope, op, inputs, place)
            # get one input element through its index i.
origin = __get_elem__(tensor_to_check, i)
# add delta to it, run op and then get the sum of the result tensor.
x_pos = origin + delta
__set_elem__(tensor_to_check, i, x_pos)
y_pos = get_output()
if in_place:
set_input(scope, op, inputs, place)
x_neg = origin - delta
__set_elem__(tensor_to_check, i, x_neg)
y_neg = get_output()
__set_elem__(tensor_to_check, i, origin)
gradient_flat[i] = (y_pos - y_neg) / delta / 2
return gradient_flat.reshape(tensor_to_check.shape())
#!/bin/bash
# set redis server ip and port for test
export TEST_REDIS_IP="test_redis_server_ip"
export TEST_REDIS_PORT="test_redis_port"
export TEST_REDIS_IP=${LOCALHOST}
export TEST_REDIS_PORT=${REDIS_PORT}
# unittest command
PYTHON_TEST="python -m unittest"
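# each module listed in TEST_MODULES below is presumably run as:
#   ${PYTHON_TEST} <module_name>
# inside the (truncated) loop at the end of this script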
# add the modules to test
TEST_MODULES=("test_datautils_aby3"
"test_model_encryption"
"test_datautils_align"
"test_op_add"
"test_op_sub"
......@@ -21,6 +20,11 @@ TEST_MODULES=("test_datautils_aby3"
"test_op_fc"
"test_op_relu"
"test_op_compare"
"test_input_embedding"
"test_op_softmax_with_cross_entropy"
"test_op_batch_norm"
"test_op_conv"
"test_op_pool"
)
# run unittest
......
......@@ -53,10 +53,10 @@ class TestDataUtilsAlign(unittest.TestCase):
party_1 = Process(target=self.run_align, args=(set_1, 1, endpoints, False))
party_2 = Process(target=self.run_align, args=(set_2, 2, endpoints, False))
party_0.start()
party_1.start()
party_2.start()
party_2.join()
party_0.start()
party_0.join()
if __name__ == '__main__':
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module tests embedding op.
"""
import unittest
from multiprocessing import Manager
import numpy as np
import paddle.fluid as fluid
import paddle_fl.mpc as pfl_mpc
import paddle_fl.mpc.data_utils.aby3 as aby3
import test_op_base
class TestInput(test_op_base.TestOpBase):
def gen_one_hot(self, input, depth):
"""
example for generate mpc one hot tensor
"""
data_var = fluid.data(name='input_data', shape=input.shape, dtype='int64')
ret1 = fluid.input.one_hot(input=data_var, depth=3)
exe =fluid.Executor(place=fluid.CPUPlace())
exe.run(fluid.default_startup_program())
data = exe.run(program=fluid.default_main_program(),feed={'input_data': input}, fetch_list=[ret1])
return data[0]
def embedding_op(self, **kwargs):
role = kwargs['role']
#data = kwargs['data']
data_normal = kwargs['data_normal']
data_share = kwargs['data_share'][role]
w_data = kwargs['w_data']
w_data_share = kwargs['w_data_share'][role]
return_results = kwargs['return_results']
expected_result = kwargs['expect_results']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
w_param_attrs = fluid.ParamAttr(name='emb_weight',
learning_rate=0.5,
initializer=pfl_mpc.initializer.NumpyArrayInitializer(w_data_share),
trainable=True)
w_param_attrs1 = fluid.ParamAttr(name='emb_weight1',
learning_rate=0.5,
initializer=fluid.initializer.NumpyArrayInitializer(w_data),
trainable=True)
input_shape = np.delete(data_share.shape, 0, 0)
data1 = pfl_mpc.data(name='input', shape=input_shape, dtype='int64')
data2 = fluid.data(name='input1', shape=data_normal.shape, dtype='int64')
math_embedding = fluid.input.embedding(input=data2, size=w_data.shape, param_attr=w_param_attrs1, dtype='float32')
op_embedding = pfl_mpc.input.embedding(input=data1, size=(input_shape[1],input_shape[0]), param_attr=w_param_attrs, dtype='int64')
exe = fluid.Executor(place=fluid.CPUPlace())
exe.run(fluid.default_startup_program())
results = exe.run(feed={'input': data_share, 'input1': data_normal}, fetch_list=[op_embedding, math_embedding])
return_results.append(results[0])
expected_result.append(results[1])
def test_embedding_op(self):
data = np.array([[1, 0, 0], [0, 1, 0]])
data_normal = np.array([0, 1]).astype('int64')
w_data = np.array([[1, 2], [2, 3], [3, 4]])
# data = self.gen_one_hot(data_normal, w_data.shape[0]).astype('int64')
data_share = aby3.make_shares(np.array(data))
data_all3shares = np.array([aby3.get_aby3_shares(data_share, i) for i in range(3)])
w_data_share = aby3.make_shares(w_data)
w_data_all3shares = np.array([aby3.get_aby3_shares(w_data_share, i) for i in range(3)])
return_results = Manager().list()
expect_results = Manager().list()
ret = self.multi_party_run(target=self.embedding_op,
data=data,
data_normal=data_normal,
w_data=w_data,
data_share=data_all3shares,
w_data_share=w_data_all3shares,
return_results=return_results,
expect_results=expect_results)
self.assertEqual(ret[0], True)
revealed = aby3.reconstruct(np.array(return_results))
# print("reveal: ", revealed)
self.assertTrue(np.allclose(revealed, expect_results[0], atol=1e-4))
def test_mpc_one_hot(self):
data = np.array([0, 1]).astype('int64')
ret = self.gen_one_hot(data, 3)
mpc_one_hot = aby3.make_shares(ret)
if __name__ == '__main__':
unittest.main()
......@@ -91,6 +91,28 @@ class TestOpAdd(test_op_base.TestOpBase):
self.assertEqual(results[0].shape, (2, 3, 4))
return_results.append(results[0])
def diff_dim_add_mid(self, **kwargs):
"""
Add with different dimensions.
:param kwargs:
:return:
"""
role = kwargs['role']
d_1 = kwargs['data_1'][role]
d_2 = kwargs['data_2'][role]
return_results = kwargs['return_results']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[3, 4, 2], dtype='int64')
y = pfl_mpc.data(name='y', shape=[4], dtype='int64')
# math_add = x + y
math_add = pfl_mpc.layers.elementwise_add(x, y, axis=1)
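        # y (shape [4]) is broadcast along axis 1 of x (shape [3, 4, 2]); the fetched
        # result keeps the leading ABY3 share dimension, hence shape (2, 3, 4, 2).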
exe = fluid.Executor(place=fluid.CPUPlace())
results = exe.run(feed={'x': d_1, 'y': d_2}, fetch_list=[math_add])
self.assertEqual(results[0].shape, (2, 3, 4, 2))
return_results.append(results[0])
def test_elementwise_add(self):
data_1 = [np.array([[0, 1, 2, 3],
[0, 1, 2, 3]]).astype('int64')] * self.party_num
......@@ -117,6 +139,7 @@ class TestOpAdd(test_op_base.TestOpBase):
expect_results=expect_results)
self.assertEqual(ret[0], True)
def test_diff_dim_add(self):
data_1 = np.full((3, 4), fill_value=2)
data_2 = np.ones((4,))
......@@ -135,6 +158,29 @@ class TestOpAdd(test_op_base.TestOpBase):
expected_out = np.array([[3, 3, 3, 3], [3, 3, 3, 3], [3, 3, 3, 3]])
self.assertTrue(np.allclose(revealed, expected_out, atol=1e-4))
def test_diff_dim_add_mid(self):
data_1 = np.full((3, 4, 2), fill_value=2)
data_2 = np.ones((4,))
# print(data_1)
# print(data_2)
data_1_shares = aby3.make_shares(data_1)
data_2_shares = aby3.make_shares(data_2)
data_1_all3shares = np.array([aby3.get_aby3_shares(data_1_shares, i) for i in range(3)])
data_2_all3shares = np.array([aby3.get_aby3_shares(data_2_shares, i) for i in range(3)])
return_results = Manager().list()
ret = self.multi_party_run(target=self.diff_dim_add_mid,
data_1=data_1_all3shares,
data_2=data_2_all3shares,
return_results=return_results)
self.assertEqual(ret[0], True)
revealed = aby3.reconstruct(np.array(return_results))
# print(revealed)
expected_out = np.array([[[3, 3], [3, 3], [3, 3], [3, 3]],
[[3, 3], [3, 3], [3, 3], [3, 3]],
[[3, 3], [3, 3], [3, 3], [3, 3]]])
self.assertTrue(np.allclose(revealed, expected_out, atol=1e-4))
def test_elementwise_add_dim_error(self):
data_1 = [np.array([0, 1, 2, 3]).astype('int64')] * self.party_num
data_2 = [np.array([4, 3, 2, 1]).astype('int64')] * self.party_num
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module tests batch_norm op.
"""
import unittest
from multiprocessing import Manager
import numpy as np
import paddle.fluid as fluid
import paddle_fl.mpc as pfl_mpc
import paddle_fl.mpc.data_utils.aby3 as aby3
import test_op_base
class TestOpBatchNorm(test_op_base.TestOpBase):
def batch_norm(self, **kwargs):
"""
        Run batch_norm on a shared input.
:param kwargs:
:return:
"""
role = kwargs['role']
d_1 = kwargs['data_1'][role]
return_results = kwargs['return_results']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[2, 3], dtype='int64')
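        # assumption: 21845 ~= 2^16 / 3, i.e. each additive share of a plain scale
        # value of 1.0 in the 2^16 fixed-point encoding; the bias shares are zero.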
param_attr = fluid.ParamAttr(name='batch_norm_w', initializer=fluid.initializer.ConstantInitializer(value=21845))
bias_attr = fluid.ParamAttr(name='batch_norm_b', initializer=fluid.initializer.ConstantInitializer(value=0))
bn_out = pfl_mpc.layers.batch_norm(input=x, param_attr = param_attr, bias_attr = bias_attr)
exe = fluid.Executor(place=fluid.CPUPlace())
exe.run(fluid.default_startup_program())
results = exe.run(feed={'x': d_1}, fetch_list=[bn_out])
self.assertEqual(results[0].shape, (2, 2, 3))
return_results.append(results[0])
def test_batch_norm(self):
data_1 = np.array(
[[10, 10, 10], [50, 50, 50]]).astype('float32')
expected_out = np.array(
[[-1, -1, -1], [1, 1, 1]]).astype('float32')
# print("input data_1: {} \n".format(data_1))
data_1_shares = aby3.make_shares(data_1)
data_1_all3shares = np.array([aby3.get_aby3_shares(data_1_shares, i) for i in range(3)])
return_results = Manager().list()
ret = self.multi_party_run(target=self.batch_norm,
data_1=data_1_all3shares,
return_results=return_results)
self.assertEqual(ret[0], True)
revealed = aby3.reconstruct(np.array(return_results))
# print("revealed: {} \n".format(revealed))
# print("expected: {} \n".format(expected_out))
self.assertTrue(np.allclose(revealed, expected_out, atol=1e-2))
if __name__ == '__main__':
unittest.main()
......@@ -39,7 +39,7 @@ class TestOpCompare(test_op_base.TestOpBase):
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[3], dtype='int64')
y = fluid.data(name='y', shape=[1, 3], dtype='float32')
y = fluid.data(name='y', shape=[3], dtype='float32')
# todo: reshape y to [3]
op_gt = pfl_mpc.layers.greater_than(x=x, y=y)
math_gt = x > y
......@@ -47,7 +47,7 @@ class TestOpCompare(test_op_base.TestOpBase):
results = exe.run(feed={'x': d_1, 'y': d_2}, fetch_list=[op_gt, math_gt])
self.assertTrue(np.allclose(results[0], results[1]))
self.assertEqual(results[0].shape, (1, 3))
self.assertEqual(results[0].shape, (3, ))
self.assertTrue(np.allclose(results[0], expected_out))
def ge(self, **kwargs):
......@@ -63,14 +63,14 @@ class TestOpCompare(test_op_base.TestOpBase):
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[3], dtype='int64')
y = fluid.data(name='y', shape=[1, 3], dtype='float32')
y = fluid.data(name='y', shape=[3], dtype='float32')
op_ge = pfl_mpc.layers.greater_equal(x=x, y=y)
math_ge = x >= y
exe = fluid.Executor(place=fluid.CPUPlace())
results = exe.run(feed={'x': d_1, 'y': d_2}, fetch_list=[op_ge, math_ge])
self.assertTrue(np.allclose(results[0], results[1]))
self.assertEqual(results[0].shape, (1, 3))
self.assertEqual(results[0].shape, (3, ))
self.assertTrue(np.allclose(results[0], expected_out))
def lt(self, **kwargs):
......@@ -86,14 +86,14 @@ class TestOpCompare(test_op_base.TestOpBase):
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[3], dtype='int64')
y = fluid.data(name='y', shape=[1, 3], dtype='float32')
y = fluid.data(name='y', shape=[3], dtype='float32')
op_lt = pfl_mpc.layers.less_than(x=x, y=y)
math_lt = x < y
exe = fluid.Executor(place=fluid.CPUPlace())
results = exe.run(feed={'x': d_1, 'y': d_2}, fetch_list=[op_lt, math_lt])
self.assertTrue(np.allclose(results[0], results[1]))
self.assertEqual(results[0].shape, (1, 3))
self.assertEqual(results[0].shape, (3, ))
self.assertTrue(np.allclose(results[0], expected_out))
def le(self, **kwargs):
......@@ -109,14 +109,14 @@ class TestOpCompare(test_op_base.TestOpBase):
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[3], dtype='int64')
y = fluid.data(name='y', shape=[1, 3], dtype='float32')
y = fluid.data(name='y', shape=[3], dtype='float32')
op_le = pfl_mpc.layers.less_equal(x=x, y=y)
math_le = x <= y
exe = fluid.Executor(place=fluid.CPUPlace())
results = exe.run(feed={'x': d_1, 'y': d_2}, fetch_list=[op_le, math_le])
self.assertTrue(np.allclose(results[0], results[1]))
self.assertEqual(results[0].shape, (1, 3))
self.assertEqual(results[0].shape, (3, ))
self.assertTrue(np.allclose(results[0], expected_out))
def equal(self, **kwargs):
......@@ -132,14 +132,14 @@ class TestOpCompare(test_op_base.TestOpBase):
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[3], dtype='int64')
y = fluid.data(name='y', shape=[1, 3], dtype='float32')
y = fluid.data(name='y', shape=[3], dtype='float32')
op_eq = pfl_mpc.layers.equal(x=x, y=y)
math_eq = x == y
exe = fluid.Executor(place=fluid.CPUPlace())
results = exe.run(feed={'x': d_1, 'y': d_2}, fetch_list=[op_eq, math_eq])
self.assertTrue(np.allclose(results[0], results[1]))
self.assertEqual(results[0].shape, (1, 3))
self.assertEqual(results[0].shape, (3, ))
self.assertTrue(np.allclose(results[0], expected_out))
def not_equal(self, **kwargs):
......@@ -155,21 +155,21 @@ class TestOpCompare(test_op_base.TestOpBase):
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[3], dtype='int64')
y = fluid.data(name='y', shape=[1, 3], dtype='float32')
y = fluid.data(name='y', shape=[3], dtype='float32')
op_ne = pfl_mpc.layers.not_equal(x=x, y=y)
math_ne = x != y
exe = fluid.Executor(place=fluid.CPUPlace())
results = exe.run(feed={'x': d_1, 'y': d_2}, fetch_list=[op_ne, math_ne])
self.assertTrue(np.allclose(results[0], results[1]))
self.assertEqual(results[0].shape, (1, 3))
self.assertEqual(results[0].shape, (3, ))
self.assertTrue(np.allclose(results[0], expected_out))
def test_gt(self):
data_1 = [np.array([[65536, 65536, 65536],
[65536, 65536, 65536]]).astype('int64')] * self.party_num
data_2 = [np.array([[5, 3, 2]]).astype('float32')] * self.party_num
expect_results = [np.array([[0, 0, 1]])] * self.party_num
data_2 = [np.array([5, 3, 2]).astype('float32')] * self.party_num
expect_results = [np.array([0, 0, 1])] * self.party_num
ret = self.multi_party_run(target=self.gt,
data_1=data_1,
data_2=data_2,
......@@ -179,8 +179,8 @@ class TestOpCompare(test_op_base.TestOpBase):
def test_ge(self):
data_1 = [np.array([[65536, 65536, 65536],
[65536, 65536, 65536]]).astype('int64')] * self.party_num
data_2 = [np.array([[5, 3, 2]]).astype('float32')] * self.party_num
expect_results = [np.array([[0, 1, 1]])] * self.party_num
data_2 = [np.array([5, 3, 2]).astype('float32')] * self.party_num
expect_results = [np.array([0, 1, 1])] * self.party_num
ret = self.multi_party_run(target=self.ge,
data_1=data_1,
data_2=data_2,
......@@ -190,8 +190,8 @@ class TestOpCompare(test_op_base.TestOpBase):
def test_lt(self):
data_1 = [np.array([[65536, 65536, 65536],
[65536, 65536, 65536]]).astype('int64')] * self.party_num
data_2 = [np.array([[5, 3, 2]]).astype('float32')] * self.party_num
expect_results = [np.array([[1, 0, 0]])] * self.party_num
data_2 = [np.array([5, 3, 2]).astype('float32')] * self.party_num
expect_results = [np.array([1, 0, 0])] * self.party_num
ret = self.multi_party_run(target=self.lt,
data_1=data_1,
data_2=data_2,
......@@ -201,8 +201,8 @@ class TestOpCompare(test_op_base.TestOpBase):
def test_le(self):
data_1 = [np.array([[65536, 65536, 65536],
[65536, 65536, 65536]]).astype('int64')] * self.party_num
data_2 = [np.array([[5, 3, 2]]).astype('float32')] * self.party_num
expect_results = [np.array([[1, 1, 0]])] * self.party_num
data_2 = [np.array([5, 3, 2]).astype('float32')] * self.party_num
expect_results = [np.array([1, 1, 0])] * self.party_num
ret = self.multi_party_run(target=self.le,
data_1=data_1,
data_2=data_2,
......@@ -212,8 +212,8 @@ class TestOpCompare(test_op_base.TestOpBase):
def test_equal(self):
data_1 = [np.array([[65536, 65536, 65536],
[65536, 65536, 65536]]).astype('int64')] * self.party_num
data_2 = [np.array([[5, 3, 2]]).astype('float32')] * self.party_num
expect_results = [np.array([[0, 1, 0]])] * self.party_num
data_2 = [np.array([5, 3, 2]).astype('float32')] * self.party_num
expect_results = [np.array([0, 1, 0])] * self.party_num
ret = self.multi_party_run(target=self.equal,
data_1=data_1,
data_2=data_2,
......@@ -223,8 +223,8 @@ class TestOpCompare(test_op_base.TestOpBase):
def test_not_equal(self):
data_1 = [np.array([[65536, 65536, 65536],
[65536, 65536, 65536]]).astype('int64')] * self.party_num
data_2 = [np.array([[5, 3, 2]]).astype('float32')] * self.party_num
expect_results = [np.array([[1, 0, 1]])] * self.party_num
data_2 = [np.array([5, 3, 2]).astype('float32')] * self.party_num
expect_results = [np.array([1, 0, 1])] * self.party_num
ret = self.multi_party_run(target=self.not_equal,
data_1=data_1,
data_2=data_2,
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module tests conv2d op.
"""
import unittest
from multiprocessing import Manager
import numpy as np
import test_op_base
from op_test import OpTest
import paddle_fl.mpc.data_utils.aby3 as aby3
import paddle.fluid as fluid
import paddle.fluid.core as core
def conv2d_forward_naive(input,
filter,
group,
conv_param,
padding_algorithm='EXPLICIT',
data_format='NCHW'):
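    # Plaintext reference convolution used to generate expected outputs for the MPC op:
    # pad the input, dilate the filter, then for every output location and group take
    # the sum of element-wise products between the input window and the filter.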
if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
raise ValueError("Unknown Attr(padding_algorithm): '%s'. "
"It can only be 'SAME' or 'VALID'." %
str(padding_algorithm))
if data_format not in ["NCHW", "NHWC"]:
raise ValueError("Unknown Attr(data_format): '%s' ."
"It can only be 'NCHW' or 'NHWC'." % str(data_format))
channel_last = (data_format == "NHWC")
if channel_last:
input = np.transpose(input, [0, 3, 1, 2])
in_n, in_c, in_h, in_w = input.shape
f_n, f_c, f_h, f_w = filter.shape
out_n = in_n
out_c = f_n
assert f_c * group == in_c
assert np.mod(out_c, group) == 0
sub_out_c = out_c // group
sub_f_n = f_n // group
stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
'dilation']
# update pad and dilation
def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
padding = []
for input_size, filter_size, stride_size in zip(input_shape, pool_size,
pool_stride):
out_size = int((input_size + stride_size - 1) / stride_size)
pad_sum = np.max((
(out_size - 1) * stride_size + filter_size - input_size, 0))
pad_0 = int(pad_sum / 2)
pad_1 = int(pad_sum - pad_0)
padding.append(pad_0)
padding.append(pad_1)
return padding
ksize = filter.shape[2:4]
if padding_algorithm == "VALID":
pad = [0, 0, 0, 0]
elif padding_algorithm == "SAME":
dilation = [1, 1]
input_data_shape = input.shape[2:4]
pad = _get_padding_with_SAME(input_data_shape, ksize, stride)
pad_h_0, pad_h_1 = pad[0], pad[0]
pad_w_0, pad_w_1 = pad[1], pad[1]
if len(pad) == 4:
pad_h_0, pad_h_1 = pad[0], pad[1]
pad_w_0, pad_w_1 = pad[2], pad[3]
out_h = 1 + (in_h + pad_h_0 + pad_h_1 - (dilation[0] *
(f_h - 1) + 1)) // stride[0]
out_w = 1 + (in_w + pad_w_0 + pad_w_1 - (dilation[1] *
(f_w - 1) + 1)) // stride[1]
out = np.zeros((out_n, out_c, out_h, out_w))
d_bolck_h = (dilation[0] * (f_h - 1) + 1)
d_bolck_w = (dilation[1] * (f_w - 1) + 1)
input_pad = np.pad(input, ((0, 0), (0, 0), (pad_h_0, pad_h_1),
(pad_w_0, pad_w_1)),
mode='constant',
constant_values=0)
filter_dilation = np.zeros((f_n, f_c, d_bolck_h, d_bolck_w))
filter_dilation[:, :, 0:d_bolck_h:dilation[0], 0:d_bolck_w:dilation[
1]] = filter
for i in range(out_h):
for j in range(out_w):
for g in range(group):
input_pad_masked = \
input_pad[:, g * f_c:(g + 1) * f_c,
i * stride[0]:i * stride[0] + d_bolck_h,
j * stride[1]:j * stride[1] + d_bolck_w]
f_sub = filter_dilation[g * sub_f_n:(g + 1) * sub_f_n, :, :, :]
# sub_f_n == sub_out_c
for k in range(sub_out_c):
                    # multiply corresponding elements, then sum over the window
out[:, g * sub_out_c + k, i, j] = \
np.sum(input_pad_masked * f_sub[k, :, :, :],
axis=(1, 2, 3))
if channel_last:
out = np.transpose(out, [0, 2, 3, 1])
return out, in_n, out_h, out_w, out_c
def create_test_channel_last_class(parent):
class TestChannelLastCase(parent):
def init_data_format(self):
self.data_format = "NHWC"
def init_test_case_2(self):
N, C, H, W = self.input_size
self.input_size = [N, H, W, C]
cls_name = "{0}_{1}".format(parent.__name__, "ChannelLast")
TestChannelLastCase.__name__ = cls_name
globals()[cls_name] = TestChannelLastCase
def create_test_padding_SAME_class(parent):
    class TestPaddingSAMECase(parent):
def init_paddings(self):
self.pad = [0, 0]
self.padding_algorithm = "SAME"
cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp")
    TestPaddingSAMECase.__name__ = cls_name
    globals()[cls_name] = TestPaddingSAMECase
def create_test_padding_VALID_class(parent):
class TestPaddingVALIDCase(parent):
def init_paddings(self):
self.pad = [1, 1]
self.padding_algorithm = "VALID"
def test_check_grad(self):
error = 0.09
if parent.__name__ in ["TestConv2dOp_AsyPadding",
"TestWithStride_AsyPadding"]:
error = 0.14
elif parent.__name__ in ["TestWithInput1x1Filter1x1_AsyPadding"]:
error = 0.66
place = core.CPUPlace()
self.check_grad_with_place(
place, {'Input', 'Filter'},
'Output',
max_relative_error=error)
cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp")
TestPaddingVALIDCase.__name__ = cls_name
globals()[cls_name] = TestPaddingVALIDCase
class TestConv2dOp(OpTest):
def setUp(self):
OpTest.setUp(self)
self.op_type = "mpc_conv2d"
self.data_format = "AnyLayout"
self.dtype = np.int64
self.init_kernel_type()
self.init_group()
self.init_dilation()
self.init_test_case()
conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
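        # test-only sharing helper (assumption): encode x in 2^16 fixed point and split
        # it into three equal additive shares; each party keeps two of them, which is
        # why the shared tensors carry a leading dimension of 2.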
share = lambda x: np.array([x * 65536/3] * 2).astype('int64')
input = np.random.random(self.input_size)
filter = np.random.uniform(-1, 1, self.filter_size)
output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
conv2d_param)
input = share(input)
filter = share(filter)
self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input),
'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
}
self.attrs = {
'strides': self.stride,
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'data_format': self.data_format,
}
self.outputs = {'Output': output}
def test_check_output(self):
place = core.CPUPlace()
self.check_output_with_place(
place, atol=1e-3)
def test_check_grad(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, {'Input', 'Filter'},
'Output',
max_relative_error=0.07)
def test_check_grad_no_filter(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.07,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, ['Filter'],
'Output',
max_relative_error=0.06,
no_grad_set=set(['Input']))
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
def init_test_case_2(self):
pass
def init_dilation(self):
self.dilations = [1, 1]
def init_group(self):
self.groups = 1
def init_kernel_type(self):
pass
class TestWithPad(TestConv2dOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestWithStride(TestConv2dOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
class TestWithGroup(TestConv2dOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
self.group = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [18, f_c, 3, 3]
class TestWith1x1(TestConv2dOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [120, f_c, 1, 1]
def init_group(self):
self.groups = 3
def test_check_grad(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, {'Input', 'Filter'},
'Output',
max_relative_error=0.6)
def test_check_grad_no_filter(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.9,
no_grad_set=set(['Filter']))
class TestWithDilation(TestConv2dOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 10, 10] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [12, f_c, 3, 3]
def init_dilation(self):
self.dilations = [2, 2]
def init_group(self):
self.groups = 3
class TestWithInput1x1Filter1x1(TestConv2dOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [100, 3, 1, 1] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [120, f_c, 1, 1]
def init_group(self):
self.groups = 3
def test_check_grad(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, {'Input', 'Filter'},
'Output',
max_relative_error=0.75)
class TestConv2dOp_v2(OpTest):
def setUp(self):
self.op_type = "mpc_conv2d"
self.dtype = np.int64
self.init_kernel_type()
self.init_group()
self.init_dilation()
self.init_data_format()
self.init_test_case()
self.init_paddings()
self.init_test_case_2()
conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
share = lambda x: np.array([x * 65536/3] * 2).astype('int64')
input = np.random.random(self.input_size)
filter = np.random.uniform(-1, 1, self.filter_size)
output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
conv2d_param, self.padding_algorithm, self.data_format)
input = share(input)
filter = share(filter)
self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input),
'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
}
self.attrs = {
'strides': self.stride,
'paddings': self.pad,
'padding_algorithm': self.padding_algorithm,
'groups': self.groups,
'dilations': self.dilations,
'data_format': self.data_format
}
self.outputs = {'Output': output}
def test_check_output(self):
place = core.CPUPlace()
self.check_output_with_place(
place, atol=1e-3)
def test_check_grad(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, {'Input', 'Filter'},
'Output',
max_relative_error=0.14)
def test_check_grad_no_filter(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.13,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, ['Filter'],
'Output',
max_relative_error=0.7,
no_grad_set=set(['Input']))
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 2]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 4, 3]
def init_dilation(self):
self.dilations = [1, 1]
def init_group(self):
self.groups = 1
def init_kernel_type(self):
pass
def init_paddings(self):
self.pad = [0, 0]
self.padding_algorithm = "EXPLICIT"
def init_data_format(self):
self.data_format = "NCHW"
def init_test_case_2(self):
pass
class TestConv2dOp_AsyPadding(TestConv2dOp_v2):
def init_paddings(self):
self.pad = [0, 0, 1, 2]
self.padding_algorithm = "EXPLICIT"
def test_check_grad(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, {'Input', 'Filter'},
'Output',
max_relative_error=0.09)
class TestWithPad_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
def init_paddings(self):
self.pad = [2, 1, 3, 2]
self.padding_algorithm = "EXPLICIT"
class TestWithStride_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
def init_paddings(self):
self.pad = [2, 1, 3, 2]
self.padding_algorithm = "EXPLICIT"
class TestWithGroup_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 2]
self.input_size = [2, 3, 5, 5] # NCHW
self.group = 3
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [24, f_c, 4, 3]
class TestWith1x1_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [120, f_c, 1, 1]
def init_group(self):
self.groups = 3
def init_paddings(self):
self.pad = [2, 2, 4, 0]
self.padding_algorithm = "EXPLICIT"
class TestWithDepthWise3x3_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.stride = [1, 1]
self.input_size = [3, 4, 10, 10] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [16, f_c, 3, 3]
def init_dilation(self):
self.dilations = [2, 2]
def init_group(self):
self.groups = 4
def init_paddings(self):
self.pad = [1, 3, 2, 1]
self.padding_algorithm = "EXPLICIT"
class TestWithDepthWise5x5_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.stride = [1, 1]
self.input_size = [2, 4, 10, 10] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [8, f_c, 5, 5]
def init_group(self):
self.groups = 4
def init_paddings(self):
self.pad = [0, 1, 1, 0]
self.padding_algorithm = "EXPLICIT"
class TestWithDepthWise7x7_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.stride = [2, 2]
self.input_size = [2, 8, 10, 10] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [16, f_c, 7, 7]
def init_group(self):
self.groups = 8
def init_paddings(self):
self.pad = [1, 3, 4, 1]
self.padding_algorithm = "EXPLICIT"
class TestWithDilation_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.stride = [1, 1]
self.input_size = [2, 3, 10, 10] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [24, f_c, 3, 3]
def init_dilation(self):
self.dilations = [2, 2]
def init_group(self):
self.groups = 3
def init_paddings(self):
self.pad = [0, 1, 3, 0]
self.padding_algorithm = "EXPLICIT"
class TestWithInput1x1Filter1x1_AsyPadding(TestConv2dOp_v2):
def init_test_case(self):
self.stride = [1, 1]
self.input_size = [40, 3, 1, 1] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [120, f_c, 1, 1]
def init_group(self):
self.groups = 3
def init_paddings(self):
self.pad = [0, 3, 4, 0]
self.padding_algorithm = "EXPLICIT"
def test_check_grad(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, {'Input', 'Filter'},
'Output',
max_relative_error=0.7)
def test_check_grad_no_filter(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, ['Input'],
'Output',
max_relative_error=0.7,
no_grad_set=set(['Filter']))
#---------- test SAME VALID -----------
create_test_padding_SAME_class(TestConv2dOp_AsyPadding)
create_test_padding_SAME_class(TestWithPad_AsyPadding)
create_test_padding_SAME_class(TestWithStride_AsyPadding)
create_test_padding_SAME_class(TestWithGroup_AsyPadding)
create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding)
create_test_padding_VALID_class(TestConv2dOp_AsyPadding)
create_test_padding_VALID_class(TestWithPad_AsyPadding)
create_test_padding_VALID_class(TestWithStride_AsyPadding)
create_test_padding_VALID_class(TestWithGroup_AsyPadding)
create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding)
# ------------ test channel last ---------
create_test_channel_last_class(TestConv2dOp_AsyPadding)
create_test_channel_last_class(TestWithPad_AsyPadding)
create_test_channel_last_class(TestWithGroup_AsyPadding)
create_test_channel_last_class(TestWith1x1_AsyPadding)
create_test_channel_last_class(TestWithInput1x1Filter1x1_AsyPadding)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module tests dynamic_gru op.
"""
import unittest
from multiprocessing import Manager
import numpy as np
import paddle.fluid as fluid
import paddle_fl.mpc as pfl_mpc
import paddle_fl.mpc.data_utils.aby3 as aby3
import test_op_base
class TestInput(test_op_base.TestOpBase):
def dyanmic_gru_op(self, **kwargs):
role = kwargs['role']
data = kwargs['data']
data_share = kwargs['data_share'][role]
weight = kwargs['weight']
weight_share = kwargs['weight_share'][role]
return_results = kwargs['return_results']
return_results_cheb = kwargs['return_results_cheb']
expected_result = kwargs['expect_results']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
hidden_dim = 1
data_paddle = fluid.data(name='input_paddle', shape=[3, 3], dtype='float32', lod_level=1)
ldata_paddle = fluid.create_lod_tensor(data, [[3]], fluid.CPUPlace())
w_param_attrs = fluid.ParamAttr(name='gru_weight',
learning_rate=0.5,
initializer=fluid.initializer.NumpyArrayInitializer(weight),
trainable=True)
hidden_paddle = fluid.layers.dynamic_gru(input=data_paddle, size=hidden_dim, param_attr=w_param_attrs,
gate_activation='sigmoid', candidate_activation='relu')
data_mpc = fluid.data(name='input_mpc', shape=[3, 2, 3], dtype='int64', lod_level=1)
        # move the batch/sequence dimension to shape[0] (ahead of the share dimension)
data_share_trans = np.transpose(data_share, [1, 0, 2])
ldata_mpc = fluid.create_lod_tensor(data_share_trans, [[3]], fluid.CPUPlace())
w_param_attrs1 = fluid.ParamAttr(name='mpc_gru_weight',
learning_rate=0.5,
initializer=pfl_mpc.initializer.NumpyArrayInitializer(weight_share),
trainable=True)
w_param_attrs2 = fluid.ParamAttr(name='mpc_gru_weight_cheb',
learning_rate=0.5,
initializer=pfl_mpc.initializer.NumpyArrayInitializer(weight_share),
trainable=True)
hidden_mpc = pfl_mpc.layers.dynamic_gru(input=data_mpc, size=hidden_dim,
param_attr=w_param_attrs1)
hidden_mpc_cheb = pfl_mpc.layers.dynamic_gru(input=data_mpc, size=hidden_dim,
param_attr=w_param_attrs2, gate_activation='sigmoid_chebyshev')
exe = fluid.Executor(place=fluid.CPUPlace())
exe.run(fluid.default_startup_program())
results = exe.run(feed={'input_paddle': ldata_paddle, 'input_mpc': ldata_mpc},
fetch_list=[hidden_paddle, hidden_mpc, hidden_mpc_cheb], return_numpy=False)
return_results.append(np.array(results[1]))
return_results_cheb.append(np.array(results[2]))
expected_result.append(np.array(results[0]))
def test_dyanmic_gru_op(self):
data = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [-1.0, -2.0, -3.0]]).astype('float32')
data_share = aby3.make_shares(data)
data_all3shares = np.array([aby3.get_aby3_shares(data_share, i) for i in range(3)])
weight = np.array([[0.0, 0.0, 0.0]]).astype('float32')
weight_share = aby3.make_shares(weight)
weight_all3shares = np.array([aby3.get_aby3_shares(weight_share, i) for i in range(3)])
return_results = Manager().list()
return_results_cheb = Manager().list()
expect_results = Manager().list()
ret = self.multi_party_run(target=self.dyanmic_gru_op,
data=data,
data_share = data_all3shares,
weight=weight,
weight_share=weight_all3shares,
return_results=return_results,
return_results_cheb=return_results_cheb,
expect_results=expect_results)
self.assertEqual(ret[0], True)
revealed = aby3.reconstruct(np.array(return_results))
revealed_cheb = aby3.reconstruct(np.array(return_results_cheb))
print("expected:", expect_results[0])
print("reveal: ", revealed)
print("reveal_cheb: ", revealed_cheb)
self.assertTrue(np.allclose(revealed, expect_results[0], atol=1e-1*5))
self.assertTrue(np.allclose(revealed_cheb, expect_results[0], atol=1e-1*5))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module tests pool2d op.
"""
import unittest
from multiprocessing import Manager
import numpy as np
import paddle.fluid as fluid
import paddle_fl.mpc as pfl_mpc
import paddle_fl.mpc.data_utils.aby3 as aby3
import test_op_base
class TestOpPool2d(test_op_base.TestOpBase):
def pool2d(self, **kwargs):
"""
        Run pool2d on a shared input.
:param kwargs:
:return:
"""
role = kwargs['role']
d_1 = kwargs['data_1'][role]
return_results = kwargs['return_results']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[1, 1, 4, 6], dtype='int64')
pool_out = pfl_mpc.layers.pool2d(input=x, pool_size=2, pool_stride=2)
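        # a 4x6 input with 2x2 pooling windows and stride 2 yields a 2x3 output map;
        # the leading 2 in the fetched shape below is the ABY3 share dimension.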
exe = fluid.Executor(place=fluid.CPUPlace())
exe.run(fluid.default_startup_program())
results = exe.run(feed={'x': d_1}, fetch_list=[pool_out])
self.assertEqual(results[0].shape, (2, 1, 1, 2, 3))
return_results.append(results[0])
def test_pool2d(self):
data_1 = np.array(
[[[[1, 2, 3, 4, 0, 100],
[5, 6, 7, 8, 0, 100],
[9, 10, 11, 12, 0, 200],
[13, 14, 15, 16, 0, 200]]]]).astype('float32')
expected_out = np.array(
[[[[6, 8, 100],
[14, 16, 200]]]]).astype('float32')
print("input data_1: {} \n".format(data_1))
data_1_shares = aby3.make_shares(data_1)
data_1_all3shares = np.array([aby3.get_aby3_shares(data_1_shares, i) for i in range(3)])
return_results = Manager().list()
ret = self.multi_party_run(target=self.pool2d,
data_1=data_1_all3shares,
return_results=return_results)
self.assertEqual(ret[0], True)
revealed = aby3.reconstruct(np.array(return_results))
#print("revealed: {} \n".format(revealed))
#print("expected: {} \n".format(expected_out))
self.assertTrue(np.allclose(revealed, expected_out, atol=1e-2))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module tests softmax_with_cross_entropy op.
"""
import unittest
from multiprocessing import Manager
import numpy as np
import paddle.fluid as fluid
import paddle_fl.mpc as pfl_mpc
import paddle_fl.mpc.data_utils.aby3 as aby3
import test_op_base
class TestOpSoftmaxWithCrossEntropy(test_op_base.TestOpBase):
def softmax_with_cross_entropy(self, **kwargs):
"""
        Run softmax_with_cross_entropy on shared inputs.
:param kwargs:
:return:
"""
role = kwargs['role']
d_1 = kwargs['data_1'][role]
d_2 = kwargs['data_2'][role]
return_results = kwargs['return_results']
pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))
x = pfl_mpc.data(name='x', shape=[2], dtype='int64')
y = pfl_mpc.data(name='y', shape=[2], dtype='int64')
cost, softmax = pfl_mpc.layers.softmax_with_cross_entropy(x, y, soft_label=True, return_softmax=True)
exe = fluid.Executor(place=fluid.CPUPlace())
results = exe.run(feed={'x': d_1, 'y': d_2}, fetch_list=[softmax])
self.assertEqual(results[0].shape, (2, 2))
return_results.append(results[0])
def test_softmax_with_cross_entropy(self):
data_1 = np.array(
[1, 1]).astype('float32')
data_2 = np.array(
[1, 0]).astype('float32')
expected_out = np.array(
[0.5, 0.5]).astype('float32')
#print("input data_1: {} \n".format(data_1))
data_1_shares = aby3.make_shares(data_1)
data_2_shares = aby3.make_shares(data_2)
data_1_all3shares = np.array([aby3.get_aby3_shares(data_1_shares, i) for i in range(3)])
data_2_all3shares = np.array([aby3.get_aby3_shares(data_2_shares, i) for i in range(3)])
return_results = Manager().list()
ret = self.multi_party_run(target=self.softmax_with_cross_entropy,
data_1=data_1_all3shares,
data_2=data_2_all3shares,
return_results=return_results)
self.assertEqual(ret[0], True)
revealed = aby3.reconstruct(np.array(return_results))
#print("revealed: {} \n".format(revealed))
#print("expected: {} \n".format(expected_out))
self.assertTrue(np.allclose(revealed, expected_out, atol=1e-4))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
def create_op(scope, op_type, inputs, outputs, attrs, cache_list=None):
kwargs = dict()
op_maker = core.op_proto_and_checker_maker
op_role_attr_name = op_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
def __create_var__(name, var_name):
scope.var(var_name).get_tensor()
kwargs[name].append(var_name)
for in_name, in_dup in Operator.get_op_inputs(op_type):
if in_name in inputs:
kwargs[in_name] = []
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, _ = item[0], item[1]
__create_var__(in_name, sub_in_name)
else:
__create_var__(in_name, in_name)
    if cache_list is not None and isinstance(cache_list, list):
for name in cache_list:
kwargs[name] = []
scope.var(name)
kwargs[name].append(name)
for out_name, out_dup in Operator.get_op_outputs(op_type):
if out_name in outputs:
kwargs[out_name] = []
if out_dup:
sub_out = outputs[out_name]
for item in sub_out:
sub_out_name, _ = item[0], item[1]
__create_var__(out_name, sub_out_name)
else:
__create_var__(out_name, out_name)
for attr_name in Operator.get_op_attr_names(op_type):
if attr_name in attrs:
kwargs[attr_name] = attrs[attr_name]
return Operator(op_type, **kwargs)
def set_input(scope, op, inputs, place):
def __set_input__(var_name, var):
if isinstance(var, tuple) or isinstance(var, np.ndarray):
tensor = scope.find_var(var_name).get_tensor()
if isinstance(var, tuple):
tensor.set_recursive_sequence_lengths(var[1])
var = var[0]
tensor._set_dims(var.shape)
tensor.set(var, place)
elif isinstance(var, float):
scope.find_var(var_name).set_float(var)
elif isinstance(var, int):
scope.find_var(var_name).set_int(var)
for in_name, in_dup in Operator.get_op_inputs(op.type()):
if in_name in inputs:
if in_dup:
sub_in = inputs[in_name]
for item in sub_in:
sub_in_name, sub_in_val = item[0], item[1]
__set_input__(sub_in_name, sub_in_val)
else:
__set_input__(in_name, inputs[in_name])
def append_input_output(block, op_proto, np_list, is_input, dtype):
'''Insert VarDesc and generate Python variable instance'''
proto_list = op_proto.inputs if is_input else op_proto.outputs
def create_var(block, name, np_list, var_proto):
dtype = None
shape = None
lod_level = None
if name not in np_list:
assert var_proto.intermediate, "{} not found".format(name)
else:
            # infer the dtype from the numpy value.
np_value = np_list[name]
if isinstance(np_value, tuple):
dtype = np_value[0].dtype
                # the output shape and lod should be inferred from the input.
if is_input:
shape = list(np_value[0].shape)
lod_level = len(np_value[1])
else:
dtype = np_value.dtype
if is_input:
shape = list(np_value.shape)
lod_level = 0
return block.create_var(
dtype=dtype, shape=shape, lod_level=lod_level, name=name)
var_dict = {}
for var_proto in proto_list:
var_name = str(var_proto.name)
if (var_name not in np_list) and var_proto.dispensable:
continue
if is_input:
assert (var_name in np_list) or (var_proto.dispensable), \
"Missing {} as input".format(var_name)
if var_proto.duplicable:
assert isinstance(np_list[var_name], list), \
"Duplicable {} should be set as list".format(var_name)
var_list = []
for (name, np_value) in np_list[var_name]:
var_list.append(
create_var(block, name, {name: np_value}, var_proto))
var_dict[var_name] = var_list
else:
var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
return var_dict
def append_loss_ops(block, output_names):
mean_inputs = list(map(block.var, output_names))
if len(mean_inputs) == 1:
loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[2, 1])
op = block.append_op(
inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mpc_mean')
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
else:
avg_sum = []
for cur_loss in mean_inputs:
cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
op = block.append_op(
inputs={"X": [cur_loss]},
outputs={"Out": [cur_avg_loss]},
type="mean")
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
avg_sum.append(cur_avg_loss)
loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
op_sum = block.append_op(
inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
op_sum.desc.infer_var_type(block.desc)
op_sum.desc.infer_shape(block.desc)
loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
op_loss = block.append_op(
inputs={"X": loss_sum},
outputs={"Out": loss},
type='scale',
attrs={'scale': 1.0 / float(len(avg_sum))})
op_loss.desc.infer_var_type(block.desc)
op_loss.desc.infer_shape(block.desc)
return loss
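These helpers mirror Paddle's classic low-level operator test utilities: `create_op` creates the scope variables and builds the `Operator`, `set_input` copies numpy values into the scope tensors, and `append_input_output`/`append_loss_ops` do the block-level equivalent. A minimal usage sketch follows; the import path `op_test` and the plain `elementwise_add` op are illustrative assumptions, not taken from this file.

```python
import numpy as np
import paddle.fluid.core as core

# assumed import path for the helpers defined above (adjust to the real file name)
from op_test import create_op, set_input

scope = core.Scope()
place = core.CPUPlace()

x = np.random.random((2, 3)).astype('float32')
y = np.random.random((2, 3)).astype('float32')
inputs = {'X': x, 'Y': y}
outputs = {'Out': np.zeros((2, 3), dtype='float32')}  # only the names are used to create scope vars

# build the operator (scope variables for X, Y and Out are created here) ...
op = create_op(scope, 'elementwise_add', inputs, outputs, attrs={})
# ... then copy the numpy inputs into the scope tensors
set_input(scope, op, inputs, place)

# run the op directly against the scope and read the result back
op.run(scope, place)
out = np.array(scope.find_var('Out').get_tensor())
assert np.allclose(out, x + y)
```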
......@@ -44,11 +44,13 @@ trainer = FLTrainerFactory().create_fl_trainer(job)
trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
place = fluid.CPUPlace()
trainer.start(place)
print(trainer._scheduler_ep, trainer._current_ep)
print("scheduler_ep is {}, current_ep is {}".format(trainer._scheduler_ep, trainer._current_ep))
output_folder = "fl_model"
epoch_id = 0
while not trainer.stop():
print("batch %d start train" % (epoch_id))
if epoch_id > 15:
break
print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
train_step = 0
for data in reader():
trainer.run(feed=data, fetch=[])
......
#!/bin/bash
unset http_proxy
unset https_proxy
python fl_master.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_scheduler.py > scheduler.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py >server0.log &
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
for ((i=0;i<2;i++))
do
python -u fl_trainer.py $i >trainer$i.log &
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid
import logging
import math
import time
logging.basicConfig(
filename="test.log",
......@@ -72,9 +73,9 @@ epoch_id = 0
step = 0
while not trainer.stop():
epoch_id += 1
if epoch_id > 40:
if epoch_id > 10:
break
print("epoch %d start train" % (epoch_id))
print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
for step_id, data in enumerate(train_reader()):
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1
......
#!/bin/bash
unset http_proxy
unset https_proxy
python fl_master.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_scheduler.py >scheduler.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
python -u fl_server.py >server0.log &
sleep 2
python -u fl_trainer.py 0 >trainer0.log &
sleep 2
python -u fl_trainer.py 1 >trainer1.log &
sleep 2
python -u fl_trainer.py 2 >trainer2.log &
sleep 2
python -u fl_trainer.py 3 >trainer3.log &
for ((i=0;i<4;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -21,6 +21,7 @@ import paddle
import paddle.fluid as fluid
import logging
import math
import time
logging.basicConfig(
filename="test.log",
......@@ -60,7 +61,7 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
epoch_id = 0
step = 0
epoch = 3000
epoch = 10
count_by_step = False
if count_by_step:
output_folder = "model_node%d" % trainer_id
......@@ -72,7 +73,7 @@ while not trainer.stop():
epoch_id += 1
if epoch_id > epoch:
break
print("epoch %d start train" % (epoch_id))
print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
#train_data,test_data= data_generater(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step)
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -97,7 +98,6 @@ while not trainer.stop():
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1
count += 1
print(count)
if count % trainer._step == 0:
break
# print("acc:%.3f" % (acc[0]))
......
#!/bin/bash
unset http_proxy
unset https_proxy
#killall python
python fl_master.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_scheduler.py >scheduler.log &
sleep 2
python -u fl_server.py >server0.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
for ((i=0;i<4;i++))
do
python -u fl_trainer.py $i >trainer$i.log &
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -17,6 +17,10 @@ pip install paddle_fl
#### How to save a program
```sh
python program_saver.py
```
In program_saver.py, you can define a model and save the program into 'load_file'.
```python
......
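# --- Illustrative sketch only: the real program_saver.py body is elided above.
# --- Assuming the standard Paddle 1.8 fluid API, a model can be defined and its
# --- inference program saved into the 'load_file' directory like this:
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# persist the program so fl_master.py can later load it from 'load_file'
fluid.io.save_inference_model(dirname='load_file',
                              feeded_var_names=['img'],
                              target_vars=[prediction],
                              executor=exe)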
......@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid
import logging
import math
import time
logging.basicConfig(
filename="test.log",
......@@ -67,9 +68,9 @@ epoch_id = 0
step = 0
while not trainer.stop():
epoch_id += 1
if epoch_id > 40:
if epoch_id > 10:
break
print("epoch %d start train" % (epoch_id))
print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
for step_id, data in enumerate(train_reader()):
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1
......
#!/bin/bash
unset http_proxy
unset https_proxy
python program_saver.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d load_file ]; then
python program_saver.py
fi
python fl_master.py
sleep 2
python -u fl_scheduler.py >scheduler.log &
sleep 2
python -u fl_server.py >server0.log &
sleep 2
python -u fl_trainer.py 0 >trainer0.log &
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_trainer.py 1 > trainer1.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
for ((i=0;i<2;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -20,6 +20,8 @@ import numpy as np
import sys
import os
import logging
import time
logging.basicConfig(
filename="test.log",
filemode="w",
......@@ -43,10 +45,9 @@ r = Gru4rec_Reader()
train_reader = r.reader(train_file_dir, place, batch_size=125)
output_folder = "model_node4"
step_i = 0
epoch_i = 0
while not trainer.stop():
step_i += 1
print("batch %d start train" % (step_i))
epoch_i += 1
train_step = 0
for data in train_reader():
#print(np.array(data['src_wordseq']))
......@@ -56,10 +57,10 @@ while not trainer.stop():
break
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
print("ppl:%.3f" % (newest_ppl))
save_dir = (output_folder + "/epoch_%d") % step_i
print("{} Epoch {} start train, train_step {}, ppl {}".format (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_i, train_step, newest_ppl))
save_dir = (output_folder + "/epoch_%d") % epoch_i
if trainer_id == 0:
print("start save")
trainer.save_inference_program(save_dir)
if step_i >= 40:
if epoch_i >= 5:
break
#!/bin/bash
unset http_proxy
unset https_proxy
python fl_master.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d mid_data ];then
sh download.sh
fi
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_scheduler.py >scheduler.log &
python -u fl_server.py >server0.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
python -u fl_trainer.py 0 >trainer0.log &
sleep 2
python -u fl_trainer.py 1 >trainer1.log &
sleep 2
python -u fl_trainer.py 2 >trainer2.log &
sleep 2
python -u fl_trainer.py 3 >trainer3.log &
for ((i=0;i<4;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -84,21 +84,16 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
# for test
while not trainer.stop():
epoch_id += 1
print("epoch %d start train" % (epoch_id))
for data in train_reader():
step_i += 1
trainer.step_id = step_i
accuracy, = trainer.run(feed=feeder.feed(data),
fetch=["accuracy_0.tmp_0"])
if step_i % 100 == 0:
print("Epoch: {0}, step: {1}, accuracy: {2}".format(
print("{} Epoch {} start train, step: {}, accuracy: {}".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())),
epoch_id, step_i, accuracy[0]))
print(step_i)
avg_loss_val, acc_val = train_test(
train_test_program=test_program,
train_test_reader=test_reader,
......@@ -106,7 +101,7 @@ while not trainer.stop():
print("Test with Epoch %d, avg_cost: %s, acc: %s" %
(epoch_id, avg_loss_val, acc_val))
if epoch_id > 40:
if epoch_id > 5:
break
if epoch_id % 5 == 0:
trainer.save_inference_program(output_folder)
#!/bin/bash
unset http_proxy
unset https_proxy
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d log ];then
mkdir log
fi
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_server.py >log/server0.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
python -u fl_scheduler.py > log/scheduler.log &
sleep 2
python -u fl_trainer.py 0 >log/trainer0.log &
sleep 2
python -u fl_trainer.py 1 >log/trainer1.log &
for ((i=0;i<2;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done