diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23bb27e77b9eab0c322a71a8ff570d12d1050377..db3c3b8e2069f9ae5ad02286b59decf8fe764c2d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,11 @@ if(ANDROID OR IOS)
     add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
 
+if (APPLE OR WIN32)
+    set(WITH_MKL OFF CACHE STRING
+        "Disable MKL for building on mac and windows" FORCE)
+endif()
+
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 85f40585da29bab9a107f5546e64870975f4c2d3..82437a84248fece843c3659c9422d9b579b5066f 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -50,6 +50,7 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
     BUILD_IN_SOURCE 1
+    PATCH_COMMAND git apply ${PADDLE_SOURCE_DIR}/patches/grpc/fix_too_early_destory.patch
     # NOTE(yuyang18):
     # Disable -Werror, otherwise the compile will fail in MacOS.
     # It seems that we cannot configure that by make command.
diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..085352fc5614d693e63a2f7241e868a9649456af
--- /dev/null
+++ b/doc/fluid/design/quantization/fixed_point_quantization.md
@@ -0,0 +1,110 @@
+Fixed-point quantization uses lower bits, for example, 2-bit, 3-bit or 8-bit fixed point to represent weights and activations, which usually are in singe-precision float-point with 32 bits. The fixed-point representation has advantages in reducing memory bandwidth, lowering power consumption and computational resources as well as the model storage requirements.  It is especially important for the inference in embedded-device deployment.
+
+According to some experiments, the apporach to quantize the model trained in float point directly works effectively on the large models, like the VGG model having many parameters. But the accuracy drops a lot for the small model. In order to improve the tradeoff between accuracy and latency, many quantized training apporaches are proposed.
+
+This document is to design a quantized training framework on Fluid. The first part will introduce how to quantize, The second part will describe the quantized training framework. The last part will illustrate how to calculate the quantization scale.
+
+
+### How to quantize
+
+There are many ways to quantize the float value to fixed-point value. For example:
+
+$$ r = min(max(x, a), b)$$
+$$ s = \frac{b - a}{n - 1} $$
+$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$
+
+where, $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximal value. $\left \lfloor \right \rceil$  denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$, for example, $k$ is 8 and $n$ is 256. $q$ is the quantized integer. 
+
+
+The quantization we applied is parameterized by the number of quantization levels and maximum absolute value:
+
+$$ M  = max(abs(x))  $$
+$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$
+
+where, $x$ is the float value to be quantized, $M$ is maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer.  For 8 bit quantization, $n=2^{8}=256$. $q$ is the quantized integer. 
+
+
+Wether the *min-max* quantization or *max-abs* quantization, they also can be represent:
+
+$q = scale * r + b$
+
+We call *min-max*, *max-abs* as the quantization arguments, also call them quantization scale or quantization range.
+
+
+How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part.
+
+
+### Training Framework
+
+#### Forward pass
+
+The forward pass is simulated quantization, see Figure 1.
+
+The training framework is as following figure. 
+
+<p align="center"> 
+<img src="quantization_forward.png" width="300" height="340"><br/>
+Figure 1. Forward in training with simulated quantization.
+</p>
+
+- Firstly, both input and weight will be quantized to 8-bit integers. 
+- Second, do the multiplication (or convolution) operation with integers.
+- Third, dequantize the multiplication (or convolution) results to 32-bit float point.
+- Finally, do bias-addition in float type of 32 bit. Here, the bias is not quantized.
+
+For general matrix multiplication (GEMM), quantize for $X$ and $W$:
+
+$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil  $$
+$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$
+
+Do GEMM:
+
+$$ Y = X_q * W_q $$
+
+
+Dequantize $Y$:
+
+$$
+\begin{align}
+Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\
+       &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\
+       &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m) 
+\end{align}
+$$
+
+From these formulas, dequantization also can be moved before GEMM, do dequantization for $Xq$ and $Wq$ at first, then do GEMM. The forward workflow in training is equivalent to following framework.
+
+<p align="center"> 
+<img src="quantization_equivalent_forward.png"  width="300" height="330"><br/>
+Figure 2. Equivalent forward in training with simulated quantization.
+</p>
+
+We use this equivalent workflow in the training. In our desigin, there is a quantization transpiler to insert the quantization operator and the de-quantization operator in the Fluid `ProgramDesc`. Since the outputs of quantization and de-quantization operator are still in floating point, they are called faked quantization and de-quantization operator. And the training framework is called simulated quantization.
+
+#### Backward pass
+
+See Figure 3. The gradients are calculated by dequantized weights and activations. All inputs and outputs are float point with 32-bit. And in the weight updating process, the gradients will be added to the original weight, not the quantized or dequantized weights.
+
+<p align="center"> 
+<img src="quantization_backward_and_optimization.png"><br/>
+Figure 3. Backward and weight updating in training with simulated quantization.
+</p>
+
+So the quantization transipler will change some inputs of the corresponding backward operators. 
+
+### How to calculate quantization scale
+
+There are two strategies to calculate quantization scale, we call them dynamic and static strategy. The dynamic strategy calculates the quantization scale value each iteration. The static strategy keeps the quantization scale for different inputs.
+
+For weights, we apply the dynamic strategy in the training, that is to say, the quantization scale will be recalculated during each iteration until the traning is finished.
+
+For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them:
+
+
+1. Calculate the mean of maximum absolute during a window.
+2. Calculate the max of maximum absolute during a window.
+3. Calculate the running mean of maximum absolute during a window, as follows:
+
+    $$ Vt = (1 - k) * V +  k * V_{t-1}  $$
+    
+    where, $V$ is the maximum absolute value of current batch, $Vt$ is the running mean value. $k$ is a factor, such as 0.9.
diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png
new file mode 100644
index 0000000000000000000000000000000000000000..84f8235ab87cb631992b691f8e05b9c0b6c93da2
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_backward_and_optimization.png differ
diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..df49c864537c047c785da12d24893e54ce0a5341
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_equivalent_forward.png differ
diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..0913f61621bb6533bcb10bd1d18120ccaaa96cff
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_forward.png differ
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index c30eff5010748685838feb984c9c817ffcf14c11..98c2f68a6c39ed12795bad4a905558917c0275a4 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -49,7 +49,9 @@ cc_library(paddle_inference_api
 # Here the shared library doesn't depend on other fluid libraries, or double free will occur.
 cc_library(paddle_inference_api_shared SHARED
     SRCS paddle_inference_api.cc paddle_inference_api_impl.cc)
+add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
+
 if(NOT APPLE)
   set(LINK_FLAGS "-fPIC -fvisibility=hidden")
   set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 6d296ff7bf14de9175dc589dfa8b46c534127ca1..a44d84cd7b99107fef09a6b4dfa60172fabd718b 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -27,7 +27,8 @@ anchor_generator_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc
 target_assign_op.cu)
 detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
-    polygon_box_transform_op.cu)
+polygon_box_transform_op.cu)
+detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 
 # Export local libraries to parent
 set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b0c9b2886504ee381b2b33e06a4552602725e57
--- /dev/null
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -0,0 +1,282 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+class RpnTargetAssignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
+                   "Input(DistMat) of RpnTargetAssignOp should not be null");
+
+    PADDLE_ENFORCE(
+        ctx->HasOutput("LocationIndex"),
+        "Output(LocationIndex) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ScoreIndex"),
+        "Output(ScoreIndex) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("TargetLabel"),
+        "Output(TargetLabel) of RpnTargetAssignOp should not be null");
+
+    auto in_dims = ctx->GetInputDim("DistMat");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
+                      "The rank of Input(DistMat) must be 2.");
+  }
+};
+
+template <typename T>
+class RpnTargetAssignKernel : public framework::OpKernel<T> {
+ public:
+  void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
+                   const int row, const int col, const float pos_threshold,
+                   const float neg_threshold, int64_t* target_label_data,
+                   std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
+    int fg_offset = fg_inds->size();
+    int bg_offset = bg_inds->size();
+    for (int64_t i = 0; i < row; ++i) {
+      const T* v = dist_data + i * col;
+      T max_dist = *std::max_element(v, v + col);
+      for (int64_t j = 0; j < col; ++j) {
+        T val = dist_data[i * col + j];
+        if (val == max_dist) target_label_data[j] = 1;
+      }
+    }
+
+    // Pick the fg/bg and count the number
+    for (int64_t j = 0; j < col; ++j) {
+      if (anchor_to_gt_max.data<T>()[j] > pos_threshold) {
+        target_label_data[j] = 1;
+      } else if (anchor_to_gt_max.data<T>()[j] < neg_threshold) {
+        target_label_data[j] = 0;
+      }
+      if (target_label_data[j] == 1) {
+        fg_inds->push_back(fg_offset + j);
+      } else if (target_label_data[j] == 0) {
+        bg_inds->push_back(bg_offset + j);
+      }
+    }
+  }
+
+  void ReservoirSampling(const int num, const int offset,
+                         std::minstd_rand engine,
+                         std::vector<int>* inds) const {
+    std::uniform_real_distribution<float> uniform(0, 1);
+    if (inds->size() > num) {
+      for (int i = num; i < inds->size(); ++i) {
+        int rng_ind = std::floor(uniform(engine) * i);
+        if (rng_ind < num)
+          std::iter_swap(inds->begin() + rng_ind + offset,
+                         inds->begin() + i + offset);
+      }
+    }
+  }
+
+  void RpnTargetAssign(const framework::ExecutionContext& ctx,
+                       const Tensor& dist, const float pos_threshold,
+                       const float neg_threshold, const int rpn_batch_size,
+                       const int fg_num, std::minstd_rand engine,
+                       std::vector<int>* fg_inds, std::vector<int>* bg_inds,
+                       int64_t* target_label_data) const {
+    auto* dist_data = dist.data<T>();
+    int64_t row = dist.dims()[0];
+    int64_t col = dist.dims()[1];
+    int fg_offset = fg_inds->size();
+    int bg_offset = bg_inds->size();
+
+    // Calculate the max IoU between anchors and gt boxes
+    Tensor anchor_to_gt_max;
+    anchor_to_gt_max.mutable_data<T>(
+        framework::make_ddim({static_cast<int64_t>(col), 1}),
+        platform::CPUPlace());
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    auto x = EigenMatrix<T>::From(dist);
+    auto x_col_max = EigenMatrix<T>::From(anchor_to_gt_max);
+    x_col_max.device(place) =
+        x.maximum(Eigen::DSizes<int, 1>(0))
+            .reshape(Eigen::DSizes<int, 2>(static_cast<int64_t>(col), 1));
+    // Follow the Faster RCNN's implementation
+    ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
+                neg_threshold, target_label_data, fg_inds, bg_inds);
+    // Reservoir Sampling
+    ReservoirSampling(fg_num, fg_offset, engine, fg_inds);
+    int bg_num = rpn_batch_size - fg_inds->size();
+    ReservoirSampling(bg_num, bg_offset, engine, bg_inds);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dist = context.Input<LoDTensor>("DistMat");
+    auto* loc_index = context.Output<Tensor>("LocationIndex");
+    auto* score_index = context.Output<Tensor>("ScoreIndex");
+    auto* tgt_lbl = context.Output<Tensor>("TargetLabel");
+
+    auto col = dist->dims()[1];
+    int64_t n = dist->lod().size() == 0UL
+                    ? 1
+                    : static_cast<int64_t>(dist->lod().back().size() - 1);
+    if (dist->lod().size()) {
+      PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
+    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
+    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
+    float fg_fraction = context.Attr<float>("fg_fraction");
+
+    int fg_num = static_cast<int>(rpn_batch_size * fg_fraction);
+
+    int64_t* target_label_data =
+        tgt_lbl->mutable_data<int64_t>({n * col, 1}, context.GetPlace());
+
+    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
+    iset(dev_ctx, tgt_lbl, static_cast<int>(-1));
+
+    std::vector<int> fg_inds;
+    std::vector<int> bg_inds;
+    std::random_device rnd;
+    std::minstd_rand engine;
+    int seed =
+        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+    engine.seed(seed);
+
+    if (n == 1) {
+      RpnTargetAssign(context, *dist, pos_threshold, neg_threshold,
+                      rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
+                      target_label_data);
+    } else {
+      auto lod = dist->lod().back();
+      for (size_t i = 0; i < lod.size() - 1; ++i) {
+        Tensor one_ins = dist->Slice(lod[i], lod[i + 1]);
+        RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold,
+                        rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
+                        target_label_data + i * col);
+      }
+    }
+    int* loc_index_data = loc_index->mutable_data<int>(
+        {static_cast<int>(fg_inds.size())}, context.GetPlace());
+    int* score_index_data = score_index->mutable_data<int>(
+        {static_cast<int>(fg_inds.size() + bg_inds.size())},
+        context.GetPlace());
+    memcpy(loc_index_data, reinterpret_cast<int*>(&fg_inds[0]),
+           fg_inds.size() * sizeof(int));
+    memcpy(score_index_data, reinterpret_cast<int*>(&fg_inds[0]),
+           fg_inds.size() * sizeof(int));
+    memcpy(score_index_data + fg_inds.size(),
+           reinterpret_cast<int*>(&bg_inds[0]), bg_inds.size() * sizeof(int));
+  }
+};
+
+class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "DistMat",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[K, M]. It is pair-wise distance matrix between the entities "
+        "represented by each row and each column. For example, assumed one "
+        "entity is A with shape [K], another entity is B with shape [M]. The "
+        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
+        "the distance is, the better macthing the pairs are. Please note, "
+        "This tensor can contain LoD information to represent a batch of "
+        "inputs. One instance of this batch can contain different numbers of "
+        "entities.");
+    AddAttr<float>(
+        "rpn_positive_overlap",
+        "Minimum overlap required between an anchor and ground-truth "
+        "box for the (anchor, gt box) pair to be a positive example.")
+        .SetDefault(0.7);
+    AddAttr<float>(
+        "rpn_negative_overlap",
+        "Maximum overlap allowed between an anchor and ground-truth "
+        "box for the (anchor, gt box) pair to be a negative examples.")
+        .SetDefault(0.3);
+    AddAttr<float>(
+        "fg_fraction",
+        "Target fraction of RoI minibatch that "
+        "is labeled foreground (i.e. class > 0), 0-th class is background.")
+        .SetDefault(0.25);
+    AddAttr<int>("rpn_batch_size_per_im",
+                 "Total number of RPN examples per image.")
+        .SetDefault(256);
+    AddAttr<bool>("fix_seed",
+                  "A flag indicating whether to use a fixed seed to generate "
+                  "random mask. NOTE: DO NOT set this flag to true in "
+                  "training. Setting this flag to true is only useful in "
+                  "unittest.")
+        .SetDefault(false);
+    AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
+    AddOutput(
+        "LocationIndex",
+        "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
+        "shape of the LocationIndex is [F], F depends on the value of input "
+        "tensor and attributes.");
+    AddOutput(
+        "ScoreIndex",
+        "(Tensor), The indexes of foreground and background anchors in all "
+        "RPN anchors(The rest anchors are ignored). The shape of the "
+        "ScoreIndex is [F + B], F and B depend on the value of input "
+        "tensor and attributes.");
+    AddOutput("TargetLabel",
+              "(Tensor<int64_t>), The target labels of each anchor with shape "
+              "[K * M, 1], "
+              "K and M is the same as they are in DistMat.");
+    AddComment(R"DOC(
+This operator can be, for given the IoU between the ground truth bboxes and the
+anchors, to assign classification and regression targets to each prediction.
+The Score index and LocationIndex will be generated according to the DistMat.
+The rest anchors would not contibute to the RPN training loss
+
+ScoreIndex is composed of foreground anchor indexes(positive labels) and
+background anchor indexes(negative labels). LocationIndex is exactly same
+as the foreground anchor indexes since we can not assign regression target to 
+the background anchors.
+
+The classification targets(TargetLabel) is a binary class label (of being
+an object or not). Following the paper of Faster-RCNN, the positive labels
+are two kinds of anchors: (i) the anchor/anchors with the highest IoU
+overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap
+higher than rpn_positive_overlap(0.7) with any ground-truth box. Note that
+a single ground-truth box may assign positive labels to multiple anchors.
+A non-positive anchor is when its IoU ratio is lower than rpn_negative_overlap
+(0.3) for all ground-truth boxes. Anchors that are neither positive nor
+negative do not contribute to the training objective.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(rpn_target_assign, ops::RpnTargetAssignOp,
+                  ops::RpnTargetAssignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(rpn_target_assign, ops::RpnTargetAssignKernel<float>,
+                       ops::RpnTargetAssignKernel<double>);
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 4a09f3870d64d8e14b2db41ff3ea7c2f9e67b558..35318a805898de645c844a2224f6df8c458d346c 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -59,7 +59,9 @@ GRPCClient::~GRPCClient() {
     for (auto& it : channels_) {
       it.second.reset();
     }
+    channels_.clear();
   }
+
   client_thread_->join();
 }
 
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a91e0f520e93c01bc5af09b691af2d5a6deda9f2
--- /dev/null
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fake_quantize_op.h"
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+class FakeQuantizeOp : public framework::OperatorWithKernel {
+ public:
+  FakeQuantizeOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FakeQuantizeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FakeQuantizeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutMovingScale"),
+                   "OutMovingScale(Out) of FakeQuantizeOp should not be null");
+    // if (ctx->HasInput("InMovingScale")) {
+    ctx->SetOutputDim("OutMovingScale", ctx->GetInputDim("InMovingScale"));
+    //}
+    // if (ctx->HasInput("InScales")) {
+    PADDLE_ENFORCE(ctx->HasOutput("OutScales"),
+                   "OutScales(Out) of FakeQuantizeOp should not be null");
+    ctx->SetOutputDim("OutScales", ctx->GetInputDim("InScales"));
+    // PADDLE_ENFORCE_EQ(ctx->Inputs("InScales")[0],
+    // ctx->Outputs("OutScales")[0],
+    //                  "Mean and MeanOut should share the same memory");
+    //}
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FakeQuantizeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddInput("InScales", "(Tensor) scale buffer, used in static quantization.")
+        .AsDispensable();
+    AddInput("InMovingScale", "Last scale, used in static quantization.")
+        .AsDispensable();
+    AddInput("InCurrentIter",
+             "Last iteration number, used in static quantization.")
+        .AsDispensable();
+    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
+    AddOutput("OutScales",
+              "(Tensor) scale buffer, used in static quantization.")
+        .AsDispensable();
+    AddOutput("OutMovingScale", " Current scale");
+    AddOutput("OutCurrentIter", "Current iteration number.").AsDispensable();
+    AddAttr<std::string>("quantize_type",
+                         "(string, default abs_max)"
+                         "The scaling tpe of the quantize operator.")
+        .SetDefault("abs_max");
+    AddAttr<int>("window_size", "(int, default 10000)").SetDefault(10000);
+    AddAttr<int>("bit_length", "(int, default 8)")
+        .SetDefault(8)
+        .AddCustomChecker([](const int &bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'bit_length' should be between 1 and 16.");
+        });
+    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddComment(R"DOC(
+FakeQuantize operator
+
+quantize_type = abs_max:
+
+    $$scale = max(abs(x))$$ 
+
+quantize_type = range_abs_max:
+
+    $$scale = max(max(abs(x)), history_abs_max)$$ 
+
+quantize_type = moving_average_abs_max:
+
+    $$scale = 0.1*scale+0.9*new_abs_max)$$ 
+
+$$Out = scale*X$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(fake_quantize, ops::FakeQuantizeOp, ops::FakeQuantizeOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fake_quantize,
+    ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..be0c6730a5119090600a27c66510b2a095c54583
--- /dev/null
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -0,0 +1,272 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include "paddle/fluid/operators/fake_quantize_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
+  int bid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = threadIdx.x;
+
+  extern __shared__ T shared_max_data[];
+  if (gridDim.x > 1) {
+    shared_max_data[tid] = T(0);
+    for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
+      T tmp = fabs(in[i]);
+      if (tmp > shared_max_data[tid]) {
+        shared_max_data[tid] = tmp;
+      }
+    }
+  } else {
+    if (bid < n) {
+      shared_max_data[tid] = fabs(in[bid]);
+    } else {
+      shared_max_data[tid] = T(0);
+    }
+  }
+  __syncthreads();
+
+  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
+    if (tid < i && shared_max_data[tid] < shared_max_data[tid + i]) {
+      shared_max_data[tid] = shared_max_data[tid + i];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    out[blockIdx.x] = shared_max_data[0];
+  }
+}
+
+float FindAbsMaxGpu(const platform::CUDADeviceContext& ctx, const float* array,
+                    int length) {
+  float host_max;
+  int kNumTheads = 1024;
+  int gridDimx = (kNumTheads - 1 + length) / kNumTheads;
+  gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
+  framework::Tensor t;
+  float* device_max = t.mutable_data<float>(framework::make_ddim({gridDimx}),
+                                            platform::CUDAPlace());
+  FindAbsMaxKernel<float><<<gridDimx, kNumTheads, kNumTheads * sizeof(float),
+                            ctx.stream()>>>(length, array, device_max);
+  FindAbsMaxKernel<
+      float><<<1, kNumTheads, kNumTheads * sizeof(float), ctx.stream()>>>(
+      gridDimx, device_max, device_max);
+  PADDLE_ENFORCE_EQ(
+      cudaMemcpy(&host_max, device_max, sizeof(float), cudaMemcpyDeviceToHost),
+      cudaSuccess, "cudaMemcpy failed");
+  return host_max;
+}
+
+template <typename T>
+__global__ void ApplySaturateKernel(const int n, const T* in, T* out,
+                                    int* num_saturate, const T min,
+                                    const T max) {
+  int bid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = threadIdx.x;
+
+  extern __shared__ int shared_count[];
+  shared_count[tid] = 0;
+  for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
+    if (in[i] > max) {
+      out[i] = max;
+      shared_count[tid] += 1;
+    } else if (in[i] < min) {
+      out[i] = min;
+      shared_count[tid] += 1;
+    } else {
+      out[i] = in[i];
+    }
+  }
+  __syncthreads();
+
+  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
+    if (tid < i) {
+      shared_count[tid] += shared_count[tid + i];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    num_saturate[blockIdx.x] = shared_count[0];
+  }
+}
+
+template <typename T>
+__global__ void ReduceKernel(const int n, const T* in, T* out) {
+  int tid = threadIdx.x;
+  extern __shared__ T shared_sum[];
+  if (tid < n) {
+    shared_sum[tid] = in[tid];
+  } else {
+    shared_sum[tid] = T(0);
+  }
+  __syncthreads();
+  // blockDim.x must >= n
+  for (int i = (n + 1) / 2; i > 0; i >>= 1) {
+    if (tid < i) {
+      shared_sum[tid] += shared_sum[tid + i];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    out[0] = shared_sum[0];
+  }
+}
+
+template <typename T>
+int ApplySaturateGpu(const platform::CUDADeviceContext& ctx, const int n,
+                     const T* in, T* out, const T min, const T max) {
+  int host_num_saturate;
+  int kNumTheads = 1024;
+  int gridDimx = (n + kNumTheads - 1) / kNumTheads;
+  gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
+  framework::Tensor t;
+  int* device_num_saturate = t.mutable_data<int>(
+      framework::make_ddim({gridDimx}), platform::CUDAPlace());
+  ApplySaturateKernel<
+      T><<<gridDimx, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
+      n, in, out, device_num_saturate, min, max);
+  ReduceKernel<int><<<1, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
+      gridDimx, device_num_saturate, device_num_saturate);
+  PADDLE_ENFORCE_EQ(cudaSuccess,
+                    cudaMemcpy(&host_num_saturate, device_num_saturate,
+                               sizeof(int), cudaMemcpyDeviceToHost),
+                    "cudaMemcpy failed");
+  return host_num_saturate;
+}
+
+template <typename DeviceContext, typename T>
+class FakeQuantizeCUDAKernel : public framework::OpKernel<T> {
+ public:
+  T FindRangeAbsMax(const platform::CUDADeviceContext& ctx,
+                    framework::Tensor* scale_list, framework::Tensor* out_scale,
+                    const T& cur_scale, int window_size,
+                    int current_iter) const {
+    T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
+    T remove_tmp = sl[current_iter];
+    sl[current_iter] = cur_scale;
+    T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
+    if (max_scale < cur_scale) {
+      max_scale = cur_scale;
+    } else if (fabs(remove_tmp - max_scale) < 1e-6) {
+      int size = (current_iter > window_size) ? window_size : current_iter;
+      max_scale = T(FindAbsMaxGpu(ctx, scale_list->data<float>(), size));
+    }
+    return max_scale;
+  }
+
+  T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
+                             framework::Tensor* out_scale,
+                             const T& cur_scale) const {
+    T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
+    T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
+    outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
+    return T(outs[0]);
+  }
+
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto& device_ctx = context.cuda_device_context();
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    const bool is_test = context.Attr<bool>("is_test");
+    tensor->mutable_data<T>(in->place());
+    context.Output<framework::Tensor>("OutMovingScale")
+        ->mutable_data<T>(
+            context.Input<framework::Tensor>("InMovingScale")->place());
+    auto quantize_type =
+        static_cast<std::string>(context.Attr<std::string>("quantize_type"));
+    if (quantize_type == std::string("range_abs_max")) {
+      context.Output<framework::Tensor>("OutScales")
+          ->mutable_data<T>(
+              context.Input<framework::Tensor>("InScales")->place());
+      context.Output<framework::Tensor>("OutCurrentIter")
+          ->mutable_data<T>(
+              context.Input<framework::Tensor>("InCurrentIter")->place());
+    }
+
+    T scale = T(1);
+    int window_size = context.Attr<int>("window_size");
+    T bin_cnt = (T)((1 << (context.Attr<int>("bit_length") - 1)) - 1);
+    if (quantize_type == std::string("abs_max")) {
+      auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
+      scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
+      saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
+
+      auto& device_ctx = context.template device_context<DeviceContext>();
+      auto* scale_list = context.Output<framework::Tensor>("OutScales");
+      math::SetConstant<DeviceContext, T> scalar;
+      scale_list->mutable_data<T>(context.GetPlace());
+      scalar(device_ctx, scale_list, static_cast<T>(0));
+      auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
+      iter->mutable_data<T>(context.GetPlace());
+      scalar(device_ctx, iter, static_cast<T>(0));
+    } else if (quantize_type == std::string("range_abs_max")) {
+      auto* moving_scale = const_cast<framework::Tensor*>(
+          context.Input<framework::Tensor>("InMovingScale"));
+      if (is_test) {
+        scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
+      } else {
+        auto* it = const_cast<framework::Tensor*>(
+            context.Input<framework::Tensor>("InCurrentIter"));
+        auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
+        int* last_iter = it->mutable_data<int>(platform::CPUPlace());
+        int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
+        auto* scale_list = context.Output<framework::Tensor>("OutScales");
+        auto* saving_scale =
+            context.Output<framework::Tensor>("OutMovingScale");
+        scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
+        scale = FindRangeAbsMax(device_ctx, scale_list, saving_scale, scale,
+                                window_size, current_iter[0]);
+        (*current_iter) = (*last_iter) + 1;
+      }
+    } else if (quantize_type == std::string("moving_average_abs_max")) {
+      auto* moving_scale = const_cast<framework::Tensor*>(
+          context.Input<framework::Tensor>("InMovingScale"));
+      if (is_test) {
+        scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
+      } else {
+        scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
+        auto* saving_scale =
+            context.Output<framework::Tensor>("OutMovingScale");
+        scale = FindMovingAverageAbsMmax(
+            const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
+      }
+    }
+
+    ApplySaturateGpu<T>(device_ctx, in->numel(), in->data<T>(),
+                        tensor->mutable_data<T>(in->place()), -scale, scale);
+    scale = bin_cnt / scale;
+
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
+    eigen_out.device(dev) = (scale * eigen_in).round();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(fake_quantize,
+                        paddle::operators::FakeQuantizeCUDAKernel<
+                            paddle::platform::CUDADeviceContext, float>,
+                        paddle::operators::FakeQuantizeCUDAKernel<
+                            paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..80f71d85dde39f773cc489fb79effcc775c5010a
--- /dev/null
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/clip_op.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::Transform;
+
+template <typename DeviceContext, typename T>
+class FakeQuantizeKernel : public framework::OpKernel<T> {
+ public:
+  T FindAbsMax(framework::Tensor* in, int n) const {
+    T* p = in->mutable_data<T>(platform::CPUPlace());
+    T abs_max = (T)0.00000001;
+    for (int i = 0; i < n; i++) {
+      T tmp = fabs(p[i]);
+      if (tmp > abs_max) abs_max = tmp;
+    }
+    return T(abs_max);
+  }
+  T FindRangeAbsMax(framework::Tensor* scale_list, framework::Tensor* out_scale,
+                    const T& cur_scale, int window_size,
+                    int current_iter) const {
+    T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
+    T remove_tmp = sl[current_iter];
+    sl[current_iter] = cur_scale;
+    T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
+    if (max_scale < cur_scale) {
+      max_scale = cur_scale;
+    } else if (fabs(remove_tmp - max_scale) < 1e-6) {
+      int size = (current_iter > window_size) ? window_size : current_iter;
+      max_scale = T(FindAbsMax(scale_list, size));
+    }
+    return max_scale;
+  }
+
+  T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
+                             framework::Tensor* out_scale,
+                             const T& cur_scale) const {
+    T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
+    T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
+    outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
+    return T(outs[0]);
+  }
+
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    const bool is_test = context.Attr<bool>("is_test");
+    tensor->mutable_data<T>(in->place());
+
+    auto* oms_tensor = context.Output<framework::Tensor>("OutMovingScale");
+    oms_tensor->mutable_data<T>(in->place());
+
+    auto quantize_type =
+        static_cast<std::string>(context.Attr<std::string>("quantize_type"));
+    if (quantize_type == std::string("range_abs_max")) {
+      auto* oss_tensor = context.Output<framework::Tensor>("OutScales");
+      oss_tensor->mutable_data<T>(
+          context.Input<framework::Tensor>("InScales")->place());
+      auto* oci_tensor = context.Output<framework::Tensor>("OutCurrentIter");
+      oci_tensor->mutable_data<T>(
+          context.Input<framework::Tensor>("InCurrentIter")->place());
+    }
+
+    T scale = static_cast<T>(1);
+    int window_size = context.Attr<int>("window_size");
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto raw_in = framework::EigenVector<T>::Flatten(*in);
+    if (quantize_type == std::string("abs_max")) {
+      auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
+      auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
+      scale_out.device(dev) = raw_in.abs().maximum();
+      scale = scale_out(0);
+
+      auto& device_ctx = context.template device_context<DeviceContext>();
+      auto* scale_list = context.Output<framework::Tensor>("OutScales");
+      math::SetConstant<DeviceContext, T> scalar;
+      scale_list->mutable_data<T>(context.GetPlace());
+      scalar(device_ctx, scale_list, static_cast<T>(0));
+      auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
+      iter->mutable_data<T>(context.GetPlace());
+      scalar(device_ctx, iter, static_cast<T>(0));
+    } else if (quantize_type == std::string("range_abs_max")) {
+      auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
+      if (is_test) {
+        scale = moving_scale->data<T>()[0];
+      } else {
+        auto* it = context.Input<framework::Tensor>("InCurrentIter");
+        auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
+        const int* last_iter = it->data<int>();
+        int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
+        auto* scale_list = context.Output<framework::Tensor>("OutScales");
+        auto* saving_scale =
+            context.Output<framework::Tensor>("OutMovingScale");
+        auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
+        scale_out.device(dev) = raw_in.abs().maximum();
+        scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
+        scale = FindRangeAbsMax(scale_list, saving_scale, scale, window_size,
+                                current_iter[0]);
+        saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
+        (*current_iter) = (*last_iter) + 1;
+      }
+    } else if (quantize_type == std::string("moving_average_abs_max")) {
+      auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
+      if (is_test) {
+        scale = moving_scale->data<T>()[0];
+      } else {
+        auto* saving_scale =
+            context.Output<framework::Tensor>("OutMovingScale");
+        auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
+        scale_out.device(dev) = raw_in.abs().maximum();
+        scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
+        scale = FindMovingAverageAbsMmax(
+            const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
+        saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
+      }
+    }
+
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), in->data<T>(),
+          in->data<T>() + in->numel(), tensor->mutable_data<T>(in->place()),
+          ClipFunctor<T>(-scale, scale));
+    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
+    eigen_out.device(dev) = (bin_cnt / scale * eigen_in).round();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
deleted file mode 120000
index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3
--- /dev/null
+++ b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
deleted file mode 120000
index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3
--- /dev/null
+++ b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
deleted file mode 120000
index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3
--- /dev/null
+++ b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
diff --git a/patches/grpc/fix_too_early_destory.patch b/patches/grpc/fix_too_early_destory.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d7790d56b07551b8daae9b9a41be5432e5b8b9cc
--- /dev/null
+++ b/patches/grpc/fix_too_early_destory.patch
@@ -0,0 +1,47 @@
+diff --git a/include/grpcpp/impl/codegen/completion_queue.h b/include/grpcpp/impl/codegen/completion_queue.h
+index 80c7c41982..3f7d8a7714 100644
+--- a/include/grpcpp/impl/codegen/completion_queue.h
++++ b/include/grpcpp/impl/codegen/completion_queue.h
+@@ -32,6 +32,8 @@
+ #ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+ #define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+ 
++#include <typeinfo>
++
+ #include <grpc/impl/codegen/atm.h>
+ #include <grpcpp/impl/codegen/completion_queue_tag.h>
+ #include <grpcpp/impl/codegen/core_codegen_interface.h>
+@@ -106,7 +108,9 @@ class CompletionQueue : private GrpcLibraryCodegen {
+ 
+   /// Destructor. Destroys the owned wrapped completion queue / instance.
+   ~CompletionQueue() {
+-    g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
++	if (typeid(*g_core_codegen_interface).hash_code() != typeid(CoreCodegenInterface).hash_code()) {
++      g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
++	}
+   }
+ 
+   /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
+diff --git a/include/grpcpp/impl/codegen/grpc_library.h b/include/grpcpp/impl/codegen/grpc_library.h
+index 17c904d71a..a092b2204d 100644
+--- a/include/grpcpp/impl/codegen/grpc_library.h
++++ b/include/grpcpp/impl/codegen/grpc_library.h
+@@ -19,6 +19,8 @@
+ #ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+ #define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+ 
++#include <typeinfo>
++
+ #include <grpcpp/impl/codegen/core_codegen_interface.h>
+ 
+ namespace grpc {
+@@ -47,7 +49,8 @@ class GrpcLibraryCodegen {
+     }
+   }
+   virtual ~GrpcLibraryCodegen() {
+-    if (grpc_init_called_) {
++    if (grpc_init_called_ &&
++		typeid(*g_glip).hash_code() != typeid(GrpcLibraryInterface).hash_code()) {
+       GPR_CODEGEN_ASSERT(g_glip &&
+                          "gRPC library not initialized. See "
+                          "grpc::internal::GrpcLibraryInitializer.");
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 797c0fbcc4a2d61f5cbbf691db19b4cba5d38630..bec161a571d0f98b99ac656f08cbc2d401c943fc 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -91,3 +91,9 @@ endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
 )
+
+find_program(PATCHELF_EXECUTABLE patchelf)
+if(NOT PATCHELF_EXECUTABLE)
+  message(FATAL_ERROR "patchelf not found, please install it.\n"
+          "For Ubuntu, the command is: apt-get install -y patchelf.")
+endif()
diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py
index 07f4dcbdab2fecf84a0a7042a48a8c8a9e5f880d..79ddd8b7e6f31383fa531f398ef37315b92a9807 100644
--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
 CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
 
 
-def reader_creator(filename, sub_name):
+def reader_creator(filename, sub_name, cycle=False):
     def read_batch(batch):
         data = batch['data']
         labels = batch.get('labels', batch.get('fine_labels', None))
@@ -56,10 +56,13 @@ def reader_creator(filename, sub_name):
             names = (each_item.name for each_item in f
                      if sub_name in each_item.name)
 
-            for name in names:
-                batch = cPickle.load(f.extractfile(name))
-                for item in read_batch(batch):
-                    yield item
+            while True:
+                for name in names:
+                    batch = cPickle.load(f.extractfile(name))
+                    for item in read_batch(batch):
+                        yield item
+                if not cycle:
+                    break
 
     return reader
 
@@ -94,34 +97,40 @@ def test100():
         'test')
 
 
-def train10():
+def train10(cycle=False):
     """
     CIFAR-10 training set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
 
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: Training reader creator
     :rtype: callable
     """
     return reader_creator(
         paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch')
+        'data_batch',
+        cycle=cycle)
 
 
-def test10():
+def test10(cycle=False):
     """
     CIFAR-10 test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
 
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: Test reader creator.
     :rtype: callable
     """
     return reader_creator(
         paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch')
+        'test_batch',
+        cycle=cycle)
 
 
 def fetch():
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 527044b415533cc640e3cfc5837c08ab0f8b74b1..2354987d20b908a32209f9ac22a2065ee43c3dfd 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -76,7 +76,8 @@ def reader_creator(data_file,
                    dataset_name,
                    mapper,
                    buffered_size=1024,
-                   use_xmap=True):
+                   use_xmap=True,
+                   cycle=False):
     '''
     1. read images from tar file and
         merge images into batch files in 102flowers.tgz_batch/
@@ -96,6 +97,8 @@ def reader_creator(data_file,
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: data reader
     :rtype: callable
     '''
@@ -108,15 +111,18 @@ def reader_creator(data_file,
     file_list = batch_images_from_tar(data_file, dataset_name, img2label)
 
     def reader():
-        for file in open(file_list):
-            file = file.strip()
-            batch = None
-            with open(file, 'r') as f:
-                batch = cPickle.load(f)
-            data = batch['data']
-            labels = batch['label']
-            for sample, label in itertools.izip(data, batch['label']):
-                yield sample, int(label) - 1
+        while True:
+            for file in open(file_list):
+                file = file.strip()
+                batch = None
+                with open(file, 'r') as f:
+                    batch = cPickle.load(f)
+                data = batch['data']
+                labels = batch['label']
+                for sample, label in itertools.izip(data, batch['label']):
+                    yield sample, int(label) - 1
+            if not cycle:
+                break
 
     if use_xmap:
         cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
@@ -125,7 +131,7 @@ def reader_creator(data_file,
         return map_readers(mapper, reader)
 
 
-def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
     '''
     Create flowers training set reader.
     It returns a reader, each sample in the reader is
@@ -138,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: train data reader
     :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
-        buffered_size, use_xmap)
+        download(SETID_URL, 'flowers', SETID_MD5),
+        TRAIN_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)
 
 
-def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
     '''
     Create flowers test set reader.
     It returns a reader, each sample in the reader is
@@ -161,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: test data reader
     :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
-        buffered_size, use_xmap)
+        download(SETID_URL, 'flowers', SETID_MD5),
+        TEST_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)
 
 
 def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 6af01297df54ffd4201776d20d51a88f5808ccb0..bcfc716739bb0d6fea9aa34db22473c9726d62a1 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -30,6 +30,7 @@ __all__ = [
     'detection_output',
     'ssd_loss',
     'detection_map',
+    'rpn_target_assign',
     'anchor_generator',
 ]
 
@@ -44,6 +45,135 @@ for _OP in set(__auto__):
     globals()[_OP] = generate_layer_fn(_OP)
 
 
+def rpn_target_assign(loc,
+                      scores,
+                      anchor_box,
+                      gt_box,
+                      rpn_batch_size_per_im=256,
+                      fg_fraction=0.25,
+                      rpn_positive_overlap=0.7,
+                      rpn_negative_overlap=0.3):
+    """
+    ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
+
+    This layer can be, for given the  Intersection-over-Union (IoU) overlap
+    between anchors and ground truth boxes, to assign classification and
+    regression targets to each each anchor, these target labels are used for
+    train RPN. The classification targets is a binary class label (of being
+    an object or not). Following the paper of Faster-RCNN, the positive labels
+    are two kinds of anchors: (i) the anchor/anchors with the highest IoU
+    overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap
+    higher than rpn_positive_overlap(0.7) with any ground-truth box. Note
+    that a single ground-truth box may assign positive labels to multiple
+    anchors. A non-positive anchor is when its IoU ratio is lower than
+    rpn_negative_overlap (0.3) for all ground-truth boxes. Anchors that are
+    neither positive nor negative do not contribute to the training objective.
+    The regression targets are the encoded ground-truth boxes associated with
+    the positive anchors.
+
+    Args:
+        loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
+            predicted locations of M bounding bboxes. N is the batch size,
+            and each bounding box has four coordinate values and the layout
+            is [xmin, ymin, xmax, ymax].
+        scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
+            predicted confidence predictions. N is the batch size, C is the
+            class number, M is number of bounding boxes. For each category
+            there are total M scores which corresponding M bounding boxes.
+        anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
+            each box is represented as [xmin, ymin, xmax, ymax],
+            [xmin, ymin] is the left top coordinate of the anchor box,
+            if the input is image feature map, they are close to the origin
+            of the coordinate system. [xmax, ymax] is the right bottom
+            coordinate of the anchor box.
+        gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D
+            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
+            bboxes of mini-batch input.
+        rpn_batch_size_per_im(int): Total number of RPN examples per image.
+        fg_fraction(float): Target fraction of RoI minibatch that is labeled
+            foreground (i.e. class > 0), 0-th class is background.
+        rpn_positive_overlap(float): Minimum overlap required between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be a positive
+            example.
+        rpn_negative_overlap(float): Maximum overlap allowed between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be a negative
+            examples.
+
+    Returns:
+        tuple: 
+               A tuple(predicted_scores, predicted_location, target_label,
+               target_bbox) is returned. The predicted_scores and
+               predicted_location is the predicted result of the RPN.
+               The target_label and target_bbox is the ground truth,
+               respectively. The predicted_location is a 2D Tensor with shape
+               [F, 4], and the shape of target_bbox is same as the shape of
+               the predicted_location, F is the number of the foreground
+               anchors. The predicted_scores is a 2D Tensor with shape
+               [F + B, 1], and the shape of target_label is same as the shape
+               of the predicted_scores, B is the number of the background
+               anchors, the F and B is depends on the input of this operator. 
+
+    Examples:
+        .. code-block:: python
+
+        loc = layers.data(name='location', shape=[2, 80],
+                          append_batch_size=False, dtype='float32')
+        scores = layers.data(name='scores', shape=[2, 40],
+                          append_batch_size=False, dtype='float32')
+        anchor_box = layers.data(name='anchor_box', shape=[20, 4],
+                          append_batch_size=False, dtype='float32')
+        gt_box = layers.data(name='gt_box', shape=[10, 4],
+                         append_batch_size=False, dtype='float32')
+        loc_pred, score_pred, loc_target, score_target =
+            fluid.layers.detection_output(loc=location,
+                                          scores=scores,
+                                          anchor_box=anchor_box,
+                                          gt_box=gt_box)
+    """
+
+    helper = LayerHelper('rpn_target_assign', **locals())
+    # 1. Compute the regression target bboxes
+    target_bbox = box_coder(
+        prior_box=anchor_box,
+        target_box=gt_box,
+        code_type='encode_center_size',
+        box_normalized=False)
+
+    # 2. Compute overlaps between the prior boxes and the gt boxes overlaps
+    iou = iou_similarity(x=gt_box, y=anchor_box)
+
+    # 3. Assign target label to anchors
+    loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    score_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    target_label = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    helper.append_op(
+        type="rpn_target_assign",
+        inputs={'Overlap': iou, },
+        outputs={
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': target_label,
+        },
+        attrs={
+            'rpn_batch_size_per_im': rpn_batch_size_per_im,
+            'rpn_positive_overlap': rpn_positive_overlap,
+            'rpn_negative_overlap': rpn_negative_overlap,
+            'fg_fraction': fg_fraction,
+        })
+
+    # 4. Reshape and gather the target entry
+    scores = nn.reshape(x=scores, shape=(-1, 1))
+    loc = nn.reshape(x=loc, shape=(-1, 4))
+    target_label = nn.reshape(x=target_label, shape=(-1, 1))
+    target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
+
+    predicted_scores = nn.gather(scores, score_index)
+    predicted_location = nn.gather(loc, loc_index)
+    target_label = nn.gather(target_label, score_index)
+    target_bbox = nn.gather(target_bbox, loc_index)
+    return predicted_scores, predicted_loc, target_label, target_bbox
+
+
 def detection_output(loc,
                      scores,
                      prior_box,
@@ -388,7 +518,6 @@ def target_assign(input,
 
     Returns:
         tuple: 
-        
                A tuple(out, out_weight) is returned. out is a 3D Tensor with 
                shape [N, P, K], N and P is the same as they are in 
                `neg_indices`, K is the same as it in input of X. If 
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c6aa9d3bb656740c528c728efafc6a47e8bff91
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -0,0 +1,51 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFakeQuantizeOp(OpTest):
+    def setUp(self):
+        self.op_type = "fake_quantize"
+        self.attrs = {
+            'bit_length': 8,
+            'quantize_type': 'abs_max',
+            'window_size': 10000
+        }
+        self.inputs = {
+            'X': np.random.random((10, 10)).astype("float32"),
+            'InScales': np.zeros(self.attrs['window_size']).astype("float32"),
+            'InCurrentIter': np.zeros(1).astype("float32"),
+            'InMovingScale': np.zeros(1).astype("float32")
+        }
+        self.scale = {
+            'abs_max': np.max(np.abs(self.inputs['X'])).astype("float32")
+        }
+        self.outputs = {
+            'Out': np.round(self.inputs['X'] / self.scale['abs_max'] * (
+                (1 << (self.attrs['bit_length'] - 1)) - 1)),
+            'OutScales': np.zeros(self.attrs['window_size']).astype("float32"),
+            'OutMovingScale':
+            np.array([self.scale['abs_max']]).astype("float32"),
+            'OutCurrentIter': np.zeros(1).astype("float32")
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..df6e0faaca6fd007b39a8f358d964055e149a025
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -0,0 +1,103 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
+                      rpn_negative_overlap, fg_fraction):
+    iou = np.transpose(iou)
+    anchor_to_gt_max = iou.max(axis=1)
+    gt_to_anchor_argmax = iou.argmax(axis=0)
+    gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
+    anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
+
+    tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1
+    tgt_lbl[anchors_with_max_overlap] = 1
+    tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1
+
+    num_fg = int(fg_fraction * rpn_batch_size_per_im)
+    fg_inds = np.where(tgt_lbl == 1)[0]
+    if len(fg_inds) > num_fg:
+        disable_inds = np.random.choice(
+            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
+        tgt_lbl[disable_inds] = -1
+    fg_inds = np.where(tgt_lbl == 1)[0]
+
+    num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
+    bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
+    if len(bg_inds) > num_bg:
+        enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
+        tgt_lbl[enable_inds] = 0
+    bg_inds = np.where(tgt_lbl == 0)[0]
+
+    loc_index = fg_inds
+    score_index = np.hstack((fg_inds, bg_inds))
+    tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
+    return loc_index, score_index, tgt_lbl
+
+
+class TestRpnTargetAssignOp(OpTest):
+    def setUp(self):
+        iou = np.random.random((10, 8)).astype("float32")
+        self.op_type = "rpn_target_assign"
+        self.inputs = {'DistMat': iou}
+        self.attrs = {
+            'rpn_batch_size_per_im': 256,
+            'rpn_positive_overlap': 0.95,
+            'rpn_negative_overlap': 0.3,
+            'fg_fraction': 0.25,
+            'fix_seed': True
+        }
+        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
+                                                            0.25)
+        self.outputs = {
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': tgt_lbl,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestRpnTargetAssignOp2(OpTest):
+    def setUp(self):
+        iou = np.random.random((10, 20)).astype("float32")
+        self.op_type = "rpn_target_assign"
+        self.inputs = {'DistMat': iou}
+        self.attrs = {
+            'rpn_batch_size_per_im': 128,
+            'rpn_positive_overlap': 0.5,
+            'rpn_negative_overlap': 0.5,
+            'fg_fraction': 0.5,
+            'fix_seed': True
+        }
+        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
+                                                            0.5)
+        self.outputs = {
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': tgt_lbl,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index a0cb39070bf7a89e3ea4cb1d31f54f919d6ff74e..52138b414e3d908e7aa589e76fe924e138e54d83 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -156,12 +156,15 @@ if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_IOMP_LIB}', libs_path)
     package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so']
 if '${WITH_MKLDNN}' == 'ON':
+    # TODO(typhoonzero): use install_name_tool to patch mkl libs once
+    # we can support mkl on mac.
+    #
     # change rpath of libmkldnn.so.0, add $ORIGIN/ to it.
     # The reason is that all thirdparty libraries in the same directory,
     # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so.
     command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
     if os.system(command) != 0:
-        raise Exception("patchelf --set-rpath for libmkldnn.so.0 fails")
+        raise Exception("patch libmkldnn.so failed, command: %s" % command)
     package_data['paddle.libs']+=['libmkldnn.so.0']
     shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 # remove unused paddle/libs/__init__.py
@@ -172,9 +175,12 @@ package_dir['paddle.libs']=libs_path
 # The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and
 # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
 # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
-command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
+if "@APPLE@" == "1":
+    command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
+else:
+    command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
 if os.system(command) != 0:
-    raise Exception("patchelf --set-rpath for core.so fails")
+    raise Exception("patch core.so failed, command: %s" % command)
 
 setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',