Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into channel_wise_quant_op

test=develop

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into channel_wise_quant_op
test=develop
e8f9dac7 · Zhen Wang · 806832e0 · 6d8771b5 · e8f9dac7 · e8f9dac7
53 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <memory>
 #include <string>
 #include <vector>

@@ -76,11 +77,11 @@ struct BuildStrategy {

  bool fuse_relu_depthwise_conv_{false};

-  bool memory_optimize_{false};
+  bool memory_optimize_{true};
  // TODO(dzhwinter):
  // make enable_inplace, memory_optimize_
  // memory_early_delete_ true by default
-  bool enable_inplace_{false};
+  bool enable_inplace_{true};

  bool enable_sequential_execution_{false};


--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -55,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
  std::vector<FetchOpHandle *> fetch_ops;

  for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -882,7 +882,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
  const RuntimeContext& ctx_;
 };

-static void CheckTensorNANOrInf(const std::string& name,
+static void CheckTensorNANOrInf(const std::string& op_type,
+                                const std::string& name,
                                const framework::Tensor& tensor) {
  if (tensor.memory_size() == 0) {
    return;
@@ -892,9 +893,9 @@ static void CheckTensorNANOrInf(const std::string& name,
    return;
  }
  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
-                 "Tensor %s contains Inf", name);
+                 "Operator %s output Tensor %s contains Inf", op_type, name);
  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
-                 "Tensor %s contains NAN", name);
+                 "Operator %s output Tensor %s contains NAN", op_type, name);
 }

 void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
@@ -988,9 +989,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
      auto* var = exec_scope.FindVar(vname);
      if (var == nullptr) continue;
      if (var->IsType<framework::LoDTensor>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+        CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
      } else if (var->IsType<framework::SelectedRows>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
+        CheckTensorNANOrInf(type_, vname,
+                            var->Get<framework::SelectedRows>().value());
      }
    }
  }

--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/tensor_util.h"
 #include <algorithm>
 #include <limits>
+#include <memory>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"


--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -127,6 +127,12 @@ void Conv2DTransposeOpMaker::Make() {
      "output feature channels,"
      "H is the height of the filter, and W is the width of the filter. "
      "We enforce groups number == 1 in the convolution transpose scenario.");
+  AddInput("Bias",
+           "(Tensor) Bias to be added to each output of filter application."
+           "The format of output tensor is X (one-dimensional) of size equal"
+           "to the number of output channels. Only used with MKL-DNN.")
+      .AsDispensable();
+
  AddOutput("Output",
            "(Tensor) The output tensor of convolution transpose operator. "
            "The format of output tensor is also NCHW.");

--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -84,13 +84,13 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault("bilinear");
    AddAttr<bool>(
        "align_corners",
-        "an optinal bool. Defaults to True. "
+        "an optional bool. Defaults to True. "
        "If True, the centers of 4 corner pixels of the input and output "
        "tensors are aligned, preserving the values at the corner pixels, "
-        "if Flase, are not aligned")
+        "If False, are not aligned")
        .SetDefault(true);
    AddAttr<int>("align_mode",
-                 "(int, default \'1\'), optional for bilinear interpolation"
+                 "(int, default \'1\'), optional for bilinear interpolation, "
                 "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , "
                 "can be \'1\' for src_idx = scale*dst_index .")
        .SetDefault(1);

--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -14,6 +14,7 @@ limitations under the License. */

 #include <algorithm>
 #include <functional>
+#include <memory>
 #include <vector>

 #include "ngraph/ngraph.hpp"

--- a/paddle/fluid/operators/ngraph/ngraph_bridge.h
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <algorithm>
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>


--- a/paddle/fluid/operators/ngraph/ops/accuracy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/accuracy_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"

--- a/paddle/fluid/operators/ngraph/ops/activation_op.h
+++ b/paddle/fluid/operators/ngraph/ops/activation_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>

 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
@@ -53,4 +55,4 @@ void BuildTanhGradNode(
 }  // namespace paddle

 REGISTER_NG_OP(relu_grad, BuildReluGradNode);
-REGISTER_NG_OP(than_grad, BuildTanhGradNode);
+REGISTER_NG_OP(tanh_grad, BuildTanhGradNode);
--- a/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
+++ b/paddle/fluid/operators/ngraph/ops/batch_norm_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "ngraph/ngraph.hpp"

--- a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
+++ b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

--- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h
+++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"

--- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
+++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once

 #include <functional>
+#include <memory>
 #include <string>
+#include <unordered_map>

 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"

--- a/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
+++ b/paddle/fluid/operators/ngraph/ops/elementwise_add_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "ngraph/ngraph.hpp"

--- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
+++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"

--- a/paddle/fluid/operators/ngraph/ops/mean_op.h
+++ b/paddle/fluid/operators/ngraph/ops/mean_op.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once

 #include <functional>
+#include <memory>
 #include <string>
+#include <unordered_map>

 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"

--- a/paddle/fluid/operators/ngraph/ops/momentum_op.h
+++ b/paddle/fluid/operators/ngraph/ops/momentum_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"

--- a/paddle/fluid/operators/ngraph/ops/mul_op.h
+++ b/paddle/fluid/operators/ngraph/ops/mul_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

--- a/paddle/fluid/operators/ngraph/ops/pool2d_op.h
+++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "ngraph/ngraph.hpp"

--- a/paddle/fluid/operators/ngraph/ops/scale_op.h
+++ b/paddle/fluid/operators/ngraph/ops/scale_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"

--- a/paddle/fluid/operators/ngraph/ops/softmax_op.h
+++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"

--- a/paddle/fluid/operators/ngraph/ops/top_k_op.h
+++ b/paddle/fluid/operators/ngraph/ops/top_k_op.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include "ngraph/ngraph.hpp"
 #include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/operators/reader/buffered_reader.h"
+#include <memory>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"


--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu
@@ -64,8 +64,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
    auto* out = ctx.Output<LoDTensor>("Out");

    auto lod = in->lod();
-    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
+    PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(),
                      "The actual size mismatches with the LoD information.");
    auto tokens = ctx.Attr<std::vector<int>>("tokens");
    auto in_len = in->numel();
@@ -85,10 +84,9 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
                           num_erased.begin() + 1);

    // Copy LoD to GPU
-    auto lod0 = lod[0];
-    auto lod_len = lod0.size();
-    const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace());
-
+    auto last_lod = lod[lod.size() - 1];
+    auto lod_len = last_lod.size();
+    const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace());
    // Calc output LoD
    thrust::device_vector<size_t> dev_out_lod(lod_len);
    size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
@@ -96,13 +94,16 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
        num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
    // Set LoD for output
-    std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end());
+    std::vector<size_t> out_last_lod(dev_out_lod.begin(), dev_out_lod.end());
    framework::LoD out_lod;
-    out_lod.push_back(out_lod0);
+    for (size_t i = 0; i < lod.size() - 1; ++i) {
+      out_lod.push_back(lod[i]);
+    }
+    out_lod.push_back(out_last_lod);
    out->set_lod(out_lod);

    // Set output
-    out->Resize({static_cast<int64_t>(out_lod0.back()), 1});
+    out->Resize({static_cast<int64_t>(out_last_lod.back()), 1});
    auto out_dat = out->mutable_data<T>(ctx.GetPlace());
    SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,

--- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h
@@ -28,19 +28,18 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
    auto* out = ctx.Output<framework::LoDTensor>("Out");

    auto lod = in->lod();
-    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
+    PADDLE_ENFORCE_EQ(lod[lod.size() - 1].back(), (size_t)in->numel(),
                      "The actual size mismatches with the LoD information.");
    auto tokens = ctx.Attr<std::vector<int>>("tokens");
    auto in_len = in->numel();
    auto in_dat = in->data<T>();
-    auto lod0 = lod[0];
+    auto last_lod = lod[lod.size() - 1];

    std::vector<size_t> num_erased(in_len + 1, 0);
-    std::vector<size_t> out_lod0(1, 0);
-    for (size_t i = 0; i < lod0.size() - 1; ++i) {
+    std::vector<size_t> out_last_lod(1, 0);
+    for (size_t i = 0; i < last_lod.size() - 1; ++i) {
      size_t num_out = 0;
-      for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) {
+      for (auto j = last_lod[i] + 1; j <= last_lod[i + 1]; ++j) {
        num_erased[j] = num_erased[j - 1];
        if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) !=
            tokens.end()) {
@@ -49,7 +48,7 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
          num_out += 1;
        }
      }
-      out_lod0.push_back(out_lod0.back() + num_out);
+      out_last_lod.push_back(out_last_lod.back() + num_out);
    }

    auto out_len = in_len - num_erased[in_len];
@@ -62,7 +61,10 @@ class SequenceEraseKernel : public framework::OpKernel<T> {
      }
    }
    framework::LoD out_lod;
-    out_lod.push_back(out_lod0);
+    for (size_t i = 0; i < lod.size() - 1; ++i) {
+      out_lod.push_back(lod[i]);
+    }
+    out_lod.push_back(out_last_lod);
    out->set_lod(out_lod);
  }
 };

--- a/paddle/fluid/operators/spectral_norm_op.cc
+++ b/paddle/fluid/operators/spectral_norm_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/spectral_norm_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class SpectralNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of SpectralNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("U"),
+                   "Input(U) of SpectralNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("V"),
+                   "Input(V) of SpectralNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SpectralNormOp should not be null.");
+
+    auto dim_weight = ctx->GetInputDim("Weight");
+    auto rank_weight = dim_weight.size();
+    PADDLE_ENFORCE(rank_weight >= 2 && rank_weight <= 5,
+                   "The rank of Input(Weights) can only be 2, 3,"
+                   "4, 5 for fc, conv1d, conv2d, conv3d layers.");
+
+    int dim = ctx->Attrs().Get<int>("dim");
+    int power_iters = ctx->Attrs().Get<int>("power_iters");
+    PADDLE_ENFORCE(dim == 0 || dim == 1, "Attr(dim) can only be 0 or 1");
+    PADDLE_ENFORCE(power_iters >= 0,
+                   "Attr(power_iters) should be larger equal then 0");
+
+    int h = dim_weight[dim];
+    int w = 1;
+    for (int i = 0; i < rank_weight; i++) {
+      if (i != dim) {
+        w *= dim_weight[i];
+      }
+    }
+    auto dim_u = ctx->GetInputDim("U");
+    auto dim_v = ctx->GetInputDim("V");
+    PADDLE_ENFORCE_EQ(dim_u[0], h,
+                      "Input(U) dims[0] should be equal to "
+                      "Input(Weight) dims[Attr(dim)]");
+    PADDLE_ENFORCE_EQ(
+        dim_v[0], w,
+        "Input(V) dims[0] should be equal to "
+        "the product of Input(Weight) dims except dims[Attr(dim)]");
+
+    ctx->SetOutputDim("Out", dim_weight);
+    ctx->ShareLoD("Weight", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("Weight")->type(),
+                                   ctx.GetPlace());
+  }
+};
+
+class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Weight",
+             "The input weight tensor of spectral_norm operator, "
+             "This can be a 2-D, 3-D, 4-D, 5-D tensor which is the "
+             "weights of fc, conv1d, conv2d, conv3d layer.");
+    AddInput("U",
+             "The weight_u tensor of spectral_norm operator, "
+             "This can be a 1-D tensor in shape [H, 1],"
+             "H is the 1st dimentions of Weight after reshape"
+             "corresponding by Attr(dim). As for Attr(dim) = 1"
+             "in conv2d layer with weight shape [M, C, K1, K2]"
+             "Weight will be reshape to [C, M*K1*K2], U will"
+             "be in shape [C, 1].");
+    AddInput("V",
+             "The weight_v tensor of spectral_norm operator, "
+             "This can be a 1-D tensor in shape [W, 1], "
+             "W is the 2nd dimentions of Weight after reshape "
+             "corresponding by Attr(dim). As for Attr(dim) = 1 "
+             "in conv2d layer with weight shape [M, C, K1, K2] "
+             "Weight will be reshape to [C, M*K1*K2], V will "
+             "be in shape [M*K1*K2, 1].");
+    AddOutput("Out",
+              "The output weight tensor of spectral_norm operator, "
+              "This tensor is in same shape with Input(Weight).");
+
+    AddAttr<int>("dim",
+                 "The index of dimension which should be permuted "
+                 "to the first before reshaping Input(Weight) to "
+                 "matrix, it should be set as 0 if Input(Weight) is "
+                 "the weight of fc layer, and should be set as 1 if "
+                 "Input(Weight) is the weight of conv layer, "
+                 "default 0.")
+        .SetDefault(0);
+    AddAttr<int>("power_iters",
+                 "number of power iterations to calculate "
+                 "spectral norm, default 1.")
+        .SetDefault(1);
+    AddAttr<float>("eps",
+                   "epsilon for numerical stability in "
+                   "calculating norms")
+        .SetDefault(1e-12);
+
+    AddComment(R"DOC(
+          This layer calculates the spectral normalization value of weight of
+          fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D
+          tensor.
+
+          Spectral normalization stabilizes the training of critic in GANs
+          (Generative Adversarial Networks). This layer rescaling weight tensor
+          with spectral normalize value.
+
+          For spectral normalization calculations, we rescaling weight
+          tensor with :math:`\sigma`, while :math:`\sigma{\mathbf{W}}` is
+
+            $$\sigma(\mathbf{W}) = \max_{\mathbf{h}: \mathbf{h} \ne 0} \\frac{\|\mathbf{W} \mathbf{h}\|_2}{\|\mathbf{h}\|_2}$$
+
+          We calculate :math:`\sigma{\mathbf{W}}` through power iterations as
+
+            $$
+            \mathbf{v} = \mathbf{W}^{T} \mathbf{u}
+            $$
+            $$
+            \mathbf{v} = \\frac{\mathbf{v}}{\|\mathbf{v}\|_2}
+            $$
+            $$
+            \mathbf{u} = \mathbf{W}^{T} \mathbf{v}
+            $$
+            $$
+            \mathbf{u} = \\frac{\mathbf{u}}{\|\mathbf{u}\|_2}
+            $$
+
+          And :math:`\sigma` should be
+
+            $$\sigma{\mathbf{W}} = \mathbf{u}^{T} \mathbf{W} \mathbf{v}$$
+
+          For details of spectral normalization, please refer to paper: 
+          `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .
+         )DOC");
+  }
+};
+
+class SpectralNormOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Weight"), "Input(Weight) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto dim_x = ctx->GetInputDim("Weight");
+    if (ctx->HasOutput(framework::GradVarName("Weight"))) {
+      ctx->SetOutputDim(framework::GradVarName("Weight"), dim_x);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("Weight")->type(),
+                                   ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(spectral_norm, ops::SpectralNormOp, ops::SpectralNormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    spectral_norm,
+    ops::SpectralNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SpectralNormKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    spectral_norm_grad,
+    ops::SpectralNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SpectralNormGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/spectral_norm_op.cu
+++ b/paddle/fluid/operators/spectral_norm_op.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/spectral_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    spectral_norm,
+    ops::SpectralNormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SpectralNormKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    spectral_norm_grad,
+    ops::SpectralNormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SpectralNormGradKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/fluid/operators/spectral_norm_op.h
+++ b/paddle/fluid/operators/spectral_norm_op.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using Tensor = framework::Tensor;
+
+using Array1 = Eigen::DSizes<int64_t, 1>;
+using Array2 = Eigen::DSizes<int64_t, 2>;
+using IndexPair = Eigen::IndexPair<int>;
+
+template <typename DeviceContext, typename T>
+static inline void TransCompute(const int rank, const Tensor& in, Tensor* out,
+                                const std::vector<int>& perm,
+                                const DeviceContext& dev_ctx) {
+  if (rank <= 1 || rank > 5) {
+    PADDLE_THROW("Invalid weight rank.");
+  }
+
+  switch (rank) {
+    case 2:
+      math::Transpose<DeviceContext, T, 2> trans2;
+      trans2(dev_ctx, in, out, perm);
+      break;
+    case 3:
+      math::Transpose<DeviceContext, T, 3> trans3;
+      trans3(dev_ctx, in, out, perm);
+      break;
+    case 4:
+      math::Transpose<DeviceContext, T, 4> trans4;
+      trans4(dev_ctx, in, out, perm);
+      break;
+    case 5:
+      math::Transpose<DeviceContext, T, 5> trans5;
+      trans5(dev_ctx, in, out, perm);
+      break;
+    default:
+      break;
+  }
+}
+
+template <typename DeviceContext, typename T>
+static inline void CalcMatrixSigmaAndNormWeight(
+    Tensor* sigma, Tensor* u, Tensor* v, Tensor* weight, const int power_iters,
+    const float eps, const framework::ExecutionContext& ctx) {
+  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+  auto sigma_t = EigenTensor<T, 2>::From(*sigma);
+  auto weight_t = EigenTensor<T, 2>::From(*weight);
+  auto u_t = EigenTensor<T, 2>::From(*u);
+  auto v_t = EigenTensor<T, 2>::From(*v);
+
+  const int h = weight->dims()[0];
+  const int w = weight->dims()[1];
+
+  for (int i = 0; i < power_iters; i++) {
+    // V = W^T * U / ||W^T * U||_2
+    blas.MatMul(*weight, true, *u, false, T(1), v, T(0));
+    auto v_t_norm =
+        v_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast(
+            Array1(w));
+    v_t.device(place) = v_t / (v_t_norm + v_t_norm.constant(eps));
+    // U = W^T * V / ||W^T * V||_2
+    blas.MatMul(*weight, false, *v, false, T(1), u, T(0));
+    auto u_t_norm =
+        u_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast(
+            Array1(h));
+    u_t.device(place) = u_t / (u_t_norm + u_t_norm.constant(eps));
+  }
+  Tensor weight_v;
+  weight_v.mutable_data<T>({h, 1}, ctx.GetPlace());
+  blas.MatMul(*weight, false, *v, false, T(1), &weight_v, T(0));
+  auto weight_v_t = EigenTensor<T, 2>::From(weight_v);
+  sigma_t.device(place) = (u_t * weight_v_t)
+                              .sum()
+                              .eval()
+                              .reshape(Array2(1, 1))
+                              .broadcast(Array2(h, w));
+  weight_t.device(place) = weight_t / sigma_t;
+}
+
+template <typename DeviceContext, typename T>
+class SpectralNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto weight = ctx.Input<Tensor>("Weight");
+    auto u = ctx.Input<Tensor>("U");
+    auto v = ctx.Input<Tensor>("V");
+    auto out = ctx.Output<Tensor>("Out");
+
+    int dim = ctx.Attr<int>("dim");
+    int power_iters = ctx.Attr<int>("power_iters");
+    float eps = ctx.Attr<float>("eps");
+
+    const int h = u->dims()[0];
+    const int w = v->dims()[0];
+
+    Tensor weight_mat;
+    auto dims = weight->dims();
+    const int rank = dims.size();
+    std::vector<int> real_dims;
+    if (dim != 0) {
+      std::vector<int> perm;
+      perm.push_back(dim);
+      real_dims.push_back(dims[dim]);
+      for (int i = 0; i < rank; i++) {
+        if (i != dim) {
+          perm.push_back(i);
+          real_dims.push_back(dims[i]);
+        }
+      }
+      weight_mat.mutable_data<T>(framework::make_ddim(real_dims),
+                                 ctx.GetPlace());
+      TransCompute<DeviceContext, T>(rank, *weight, &weight_mat, perm, dev_ctx);
+    } else {
+      for (int i = 0; i < rank; i++) {
+        real_dims.push_back(i);
+      }
+      TensorCopySync(*weight, ctx.GetPlace(), &weight_mat);
+    }
+    weight_mat = weight_mat.Resize({h, w});
+
+    Tensor sigma;
+    sigma.mutable_data<T>(weight_mat.dims(), ctx.GetPlace());
+    Tensor uu, vv;
+    TensorCopySync(*u, ctx.GetPlace(), &uu);
+    TensorCopySync(*v, ctx.GetPlace(), &vv);
+    CalcMatrixSigmaAndNormWeight<DeviceContext, T>(
+        &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat,
+        power_iters, eps, ctx);
+
+    if (dim != 0) {
+      std::vector<int> perm;
+      for (int i = 0; i < rank; i++) {
+        if (i < dim) {
+          perm.push_back(i + 1);
+        } else if (i == dim) {
+          perm.push_back(0);
+        } else {
+          perm.push_back(i);
+        }
+      }
+      out->mutable_data<T>(dims, ctx.GetPlace());
+      TransCompute<DeviceContext, T>(
+          rank, weight_mat.Resize(framework::make_ddim(real_dims)), out, perm,
+          dev_ctx);
+    } else {
+      TensorCopySync(weight_mat.Resize(dims), ctx.GetPlace(), out);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SpectralNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto weight = ctx.Input<Tensor>("Weight");
+    auto u = ctx.Input<Tensor>("U");
+    auto v = ctx.Input<Tensor>("V");
+    auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto weight_grad = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+
+    int dim = ctx.Attr<int>("dim");
+    int power_iters = ctx.Attr<int>("power_iters");
+    float eps = ctx.Attr<float>("eps");
+
+    const int h = u->dims()[0];
+    const int w = v->dims()[0];
+
+    Tensor weight_mat, out_grad_mat;
+    auto dims = weight->dims();
+    const int rank = dims.size();
+    std::vector<int> real_dims;
+    if (dim != 0) {
+      std::vector<int> perm;
+      perm.push_back(dim);
+      real_dims.push_back(dims[dim]);
+      for (int i = 0; i < rank; i++) {
+        if (i != dim) {
+          perm.push_back(i);
+          real_dims.push_back(dims[i]);
+        }
+      }
+      weight_mat.mutable_data<T>(framework::make_ddim(real_dims),
+                                 ctx.GetPlace());
+      out_grad_mat.mutable_data<T>(framework::make_ddim(real_dims),
+                                   ctx.GetPlace());
+      TransCompute<DeviceContext, T>(rank, *weight, &weight_mat, perm, dev_ctx);
+      TransCompute<DeviceContext, T>(rank, *out_grad, &out_grad_mat, perm,
+                                     dev_ctx);
+    } else {
+      for (int i = 0; i < rank; i++) {
+        real_dims.push_back(i);
+      }
+      TensorCopySync(*weight, ctx.GetPlace(), &weight_mat);
+      TensorCopySync(*out_grad, ctx.GetPlace(), &out_grad_mat);
+    }
+    weight_mat = weight_mat.Resize({h, w});
+    out_grad_mat = out_grad_mat.Resize({h, w});
+
+    Tensor sigma;
+    sigma.mutable_data<T>(weight_mat.dims(), ctx.GetPlace());
+    Tensor uu, vv;
+    TensorCopySync(*u, ctx.GetPlace(), &uu);
+    TensorCopySync(*v, ctx.GetPlace(), &vv);
+    CalcMatrixSigmaAndNormWeight<DeviceContext, T>(
+        &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat,
+        power_iters, eps, ctx);
+
+    Tensor uv;
+    uv.mutable_data<T>({h, w}, ctx.GetPlace());
+    blas.MatMul(uu.Resize({h, 1}), false, vv.Resize({w, 1}), false, T(1), &uv,
+                T(0));
+
+    Tensor weight_grad_mat;
+    weight_grad_mat.mutable_data<T>({h, w}, ctx.GetPlace());
+    auto weight_grad_mat_t = EigenTensor<T, 2>::From(weight_grad_mat);
+    auto weight_mat_t = EigenTensor<T, 2>::From(weight_mat);
+    auto out_grad_mat_t = EigenTensor<T, 2>::From(out_grad_mat);
+    auto sigma_t = EigenTensor<T, 2>::From(sigma);
+    auto uv_t = EigenTensor<T, 2>::From(uv);
+    weight_mat_t.device(place) =
+        weight_mat_t.sum().eval().reshape(Array2(1, 1)).broadcast(Array2(h, w));
+    weight_grad_mat_t.device(place) =
+        out_grad_mat_t * (out_grad_mat_t.constant(1.0) - uv_t * weight_mat_t) /
+        sigma_t;
+
+    if (dim != 0) {
+      std::vector<int> perm;
+      for (int i = 0; i < rank; i++) {
+        if (i < dim) {
+          perm.push_back(i + 1);
+        } else if (i == dim) {
+          perm.push_back(0);
+        } else {
+          perm.push_back(i);
+        }
+      }
+      weight_grad->mutable_data<T>(dims, ctx.GetPlace());
+      TransCompute<DeviceContext, T>(
+          rank, weight_grad_mat.Resize(framework::make_ddim(real_dims)),
+          weight_grad, perm, dev_ctx);
+    } else {
+      TensorCopySync(weight_grad_mat.Resize(dims), ctx.GetPlace(), weight_grad);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -415,10 +415,11 @@ function assert_api_not_changed() {
    source .env/bin/activate
    pip install ${PADDLE_ROOT}/build/python/dist/*whl
    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec
+
    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
        # Use sed to make python2 and python3 sepc keeps the same
        sed -i 's/arg0: str/arg0: unicode/g' new.spec
-        sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
+        sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
    fi
    # ComposeNotAligned has significant difference between py2 and py3
    sed -i '/.*ComposeNotAligned.*/d' new.spec
@@ -452,12 +453,21 @@ function assert_api_spec_approvals() {
      echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
      if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
          # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
-          APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-          python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+          if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
+            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308`
+          else
+            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+          fi
          echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
          if [ "${APPROVALS}" == "FALSE" ]; then
+            if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
+              echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}"
+            else
              echo "You must have panyx0718 approval for the api change! ${API_FILE}"
-              exit 1
+            fi
+            exit 1
          fi
      fi
    done
@@ -472,19 +482,6 @@ function assert_api_spec_approvals() {
            exit 1
        fi
    fi
-
-    pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl
-    CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py`
-    if [ "True" != ${CHECK_DOCK_MD5} ]; then
-        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
-        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
-        if [ "${APPROVALS}" == "FALSE" ]; then
-            echo "You must have shanyi15 approval for the api doc change! "
-            exit 1
-        fi
-        echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt
-    fi
 }



--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -131,7 +131,8 @@ def __bootstrap__():
        'fast_eager_deletion_mode', 'allocator_strategy',
        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
        'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism',
-        'enable_parallel_graph', 'multiple_of_cupti_buffer_size'
+        'enable_parallel_graph', 'multiple_of_cupti_buffer_size',
+        'enable_subgraph_optimize'
    ]
    if 'Darwin' not in sysstr:
        read_env_flags.append('use_pinned_memory')

--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -17,7 +17,6 @@ import os
 import six
 import sys
 from .. import compat as cpt
-from . import framework

 from . import core
 from . import framework
@@ -36,6 +35,30 @@ def _place_obj(place):
    return p


+def _is_pserver_mode(main_program):
+    main = main_program if main_program \
+        else default_main_program()
+    for op in main.global_block().ops:
+        if op.type in ["send", "recv"]:
+            return True
+    return False
+
+
+def get_available_places(use_cuda):
+    if use_cuda:
+        gpus_env = os.getenv("FLAGS_selected_gpus")
+        if gpus_env:
+            gpus = [int(s) for s in gpus_env.split(",")]
+        else:
+            gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
+        places = [core.CUDAPlace(i) for i in gpus]
+    else:
+        cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+        places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
+    assert places, "no place for execution"
+    return places
+
+
 class CompiledProgram(object):
    """
    Compiles to Graph for execution.
@@ -127,8 +150,7 @@ class CompiledProgram(object):
            self._exec_strategy = ExecutionStrategy()
        if self._build_strategy is None:
            self._build_strategy = BuildStrategy()
-        self._build_strategy.is_distribution = framework.is_pserver_mode(
-            self._program)
+        self._build_strategy.is_distribution = _is_pserver_mode(self._program)
        return self

    def with_inference_optimize(self, config):
@@ -153,9 +175,9 @@ class CompiledProgram(object):
    def _with_distributed(self):
        raise NotImplementedError()

-    def _compile_data_parallel(self):
+    def _compile_data_parallel(self, use_cuda=False, scope=None):
        if self._share_vars_from:
-            if self._scope:
+            if scope:
                sys.stderr.write("share_vars_from is set, scope is ignored.\n")
            if not self._share_vars_from._is_data_parallel:
                raise ValueError("share_vars_from is not data parallel. Cannot "
@@ -166,23 +188,11 @@ class CompiledProgram(object):
                    "var to share.")
            self._local_scopes = self._share_vars_from._executor.local_scopes()
        else:
+            assert scope is not None, ""
            self._local_scopes = []

-        self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
-        if self._exec_strategy.use_cuda:
-            gpus_env = os.getenv("FLAGS_selected_gpus")
-            if gpus_env:
-                gpus = [int(s) for s in gpus_env.split(",")]
-            else:
-                gpus = [
-                    i for i in six.moves.range(core.get_cuda_device_count())
-                ]
-            self._places = [core.CUDAPlace(i) for i in gpus]
-        else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-        assert self._places, "no place for execution"
+        self._exec_strategy.use_cuda = use_cuda
+        self._places = get_available_places(self._exec_strategy.use_cuda)

        if self._exec_strategy.num_threads == 0:
            if self._exec_strategy.use_cuda:
@@ -196,10 +206,12 @@ class CompiledProgram(object):

        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
        # if turn on python memory optimize, turn off the inplace_pass.
-        if self._build_strategy.memory_optimize is None:
-            self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True
-        if self._build_strategy.enable_inplace is None:
-            self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True
+        # memory_optimize and enable_inplace default are True, but we can disable them on purpose
+        if self._program and self._program._is_mem_optimized:
+            self._build_strategy.memory_optimize = False
+
+        if self._program and self._program._is_mem_optimized:
+            self._build_strategy.enable_inplace = False

        # TODO(wuyi): trainer endpoings should be passed in through
        # build_strategy, not program.xxx.
@@ -221,12 +233,12 @@ class CompiledProgram(object):

        places = list(map(_place_obj, self._places))

-        return core.ParallelExecutor(
-            places,
-            set(self._persistable_vars),
-            cpt.to_text(self._loss_name)
-            if self._loss_name else six.u(''), self._scope, self._local_scopes,
-            self._exec_strategy, self._build_strategy, self._graph)
+        return core.ParallelExecutor(places,
+                                     set(self._persistable_vars),
+                                     cpt.to_text(self._loss_name)
+                                     if self._loss_name else six.u(''), scope,
+                                     self._local_scopes, self._exec_strategy,
+                                     self._build_strategy, self._graph)

    def _compile_inference(self):
        return core.create_paddle_predictor(self._infer_config)
@@ -253,7 +265,9 @@ class CompiledProgram(object):
        self._scope = scope
        self._place = place
        if self._is_data_parallel:
-            self._executor = self._compile_data_parallel()
+            self._executor = self._compile_data_parallel(
+                use_cuda=isinstance(self._place, core.CUDAPlace),
+                scope=self._scope)
        elif self._is_inference:
            self._executor = self._compile_inference()
        else:

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):

 class Executor(object):
    """
-    An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
-    ParallelExecutor.
-    Python executor takes a program, add feed operators and fetch operators to this program according
+    An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
+    Python executor takes a program, adds feed operators and fetch operators to this program according
    to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
-    the variables(or names) that user want to get after program run. Note: the executor will run all
+    the variables(or names) that user wants to get after program runs. Note: the executor will run all
    operators in the program but not only the operators dependent by the fetch_list.
-    It store the global variables into the global scope, and create a local scope for the temporary
-    variables. The local scope contents will be discarded after every minibatch forward/backward finished.
-    But the global scope variables will be persistent through different runs.
-    All of ops in program will be running in sequence.
+    It stores the global variables into the global scope, and creates a local scope for the temporary
+    variables. The contents in local scope may be discarded after every minibatch forward/backward
+    finished. But the global scope variables will be persistent through different runs.


    Example:
-    .. code-block:: python
-        # First create the Executor.
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        # Run the startup program once and only once.
-        # Not need to optimize/compile the startup program.
-        exe.run(fluid.default_startup_program())
-
-        # Run the main program directly without compile.
-        loss, = exe.run(fluid.default_main_program(),
-                        feed=feed_dict,
-                        fetch_list=[loss.name])
-        # Or, compiled the program and run. See `CompiledProgram` for more detail.
-        compiled_prog = compiler.CompiledProgram(
-            fluid.default_main_program()).with_data_parallel(
-            loss_name=loss.name)
-        loss, = exe.run(compiled_prog,
-                        feed=feed_dict,
-                        fetch_list=[loss.name])
+
+        .. code-block:: python
+
+            # First create the Executor.
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            # Run the startup program once and only once.
+            # Not need to optimize/compile the startup program.
+            exe.run(fluid.default_startup_program())
+
+            # Run the main program directly without compile.
+            loss, = exe.run(fluid.default_main_program(),
+                            feed=feed_dict,
+                            fetch_list=[loss.name])
+            # Or, compiled the program and run. See `CompiledProgram` for more detail.
+            compiled_prog = compiler.CompiledProgram(
+                fluid.default_main_program()).with_data_parallel(
+                loss_name=loss.name)
+            loss, = exe.run(compiled_prog,
+                            feed=feed_dict,
+                            fetch_list=[loss.name])

    Args:
        place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
-
-    Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
-    They has the exactly same arguments, and expected the same results.
    """

    def __init__(self, place):
@@ -382,6 +379,12 @@ class Executor(object):
        ]
        return outs

+    '''
+    TODO(typhoonzero): Define "no longer use" meaning? Can user create
+    a new Executor for the same program and run?
+    TODO(panyx0718): Why ParallelExecutor doesn't have close?
+    '''
+
    def close(self):
        """
        Close this executor.
@@ -389,9 +392,6 @@ class Executor(object):
        You can no longer use this executor after calling this method.
        For the distributed training, this method would free the resource on PServers related to
        the current Trainer.
-        TODO(typhoonzero): Define "no longer use" meaning? Can user create
-        a new Executor for the same program and run?
-        TODO(panyx0718): Why ParallelExecutor doesn't have close?

        Example:
            >>> cpu = core.CPUPlace()

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -87,15 +87,6 @@ def _current_expected_place():
    return _imperative_current_expected_place_


-def is_pserver_mode(main_program):
-    main = main_program if main_program \
-        else default_main_program()
-    for op in main.global_block().ops:
-        if op.type in ["send", "recv"]:
-            return True
-    return False
-
-
 class NameScope(object):
    def __init__(self, name="", parent=None):
        self._children = dict()

--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -468,9 +468,10 @@ def save_persistables(executor, dirname, main_program=None, filename=None):

            exe = fluid.Executor(fluid.CPUPlace())
            param_path = "./my_paddle_model"
+            # `prog` can be a program defined by the user
            prog = fluid.default_main_program()
            fluid.io.save_persistables(executor=exe, dirname=param_path,
-                                       main_program=None)
+                                       main_program=prog)
    """

    if main_program and main_program._is_distributed:

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -94,6 +94,7 @@ __all__ = [
    'multiplex',
    'layer_norm',
    'group_norm',
+    'spectral_norm',
    'softmax_with_cross_entropy',
    'smooth_l1',
    'one_hot',
@@ -3346,6 +3347,98 @@ def group_norm(input,
    return helper.append_activation(group_norm_out)


+@templatedoc()
+def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None):
+    """
+    **Spectral Normalization Layer**
+
+    This layer calculates the spectral normalization value of weight parameters of
+    fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D
+    Parameters. Calculations are showed as follows.
+
+    Step 1:
+    Generate vector U in shape of [H], and V in shape of [W].
+    While H is the :attr:`dim` th dimension of the input weights,
+    and W is the product result of remaining dimensions.
+
+    Step 2:
+    :attr:`power_iters` shoule be a positive interger, do following
+    calculations with U and V for :attr:`power_iters` rounds.
+
+    .. math:: 
+
+        \mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2}
+
+        \mathbf{u} := \\frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2}
+
+    Step 3:
+    Calculate :math:`\sigma(\mathbf{W})` and normalize weight values.
+
+    .. math::
+
+        \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v}
+
+        \mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})}
+                
+
+    Refer to `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .
+
+    Args:
+        weight(${weight_type}): ${weight_comment}
+        dim(int): ${dim_comment}
+        power_iters(int): ${power_iters_comment}
+        eps(float): ${eps_comment}
+        name (str): The name of this layer. It is optional.
+
+    Returns:
+        Variable: A tensor variable of weight parameters after spectral normalization.
+
+    Examples:
+
+        >>> weight = fluid.layers.data(name='weight', shape=[8, 32, 32],
+        >>>                          dtype='float32')
+        >>> x = fluid.layers.spectral_norm(weight=data, dim=1, power_iters=2)
+    """
+    helper = LayerHelper('spectral_norm', **locals())
+    dtype = weight.dtype
+
+    # create intput and parameters
+    inputs = {'Weight': weight}
+    input_shape = weight.shape
+    h = input_shape[dim]
+    w = np.prod(input_shape) // h
+
+    u = helper.create_parameter(
+        attr=ParamAttr(),
+        shape=[h],
+        dtype=dtype,
+        default_initializer=Normal(0., 1.))
+    u.stop_gradient = True
+    inputs['U'] = u
+    v = helper.create_parameter(
+        attr=ParamAttr(),
+        shape=[w],
+        dtype=dtype,
+        default_initializer=Normal(0., 1.))
+    inputs['V'] = v
+    v.stop_gradient = True
+
+    # create output
+    out = helper.create_variable(dtype=dtype)
+
+    helper.append_op(
+        type="spectral_norm",
+        inputs=inputs,
+        outputs={"Out": out, },
+        attrs={
+            "dim": dim,
+            "power_iters": power_iters,
+            "eps": eps,
+        })
+
+    return out
+
+
 def conv2d_transpose(input,
                     num_filters,
                     output_size=None,
@@ -6844,56 +6937,58 @@ def image_resize(input,

    Example:

-      For scale:
-      
-        if align_corners = True && out_size > 1 :
+    .. code-block:: text

-          scale_factor = (in_size-1.0)/(out_size-1.0)
-        
-        else:
+        For scale:
          
-          scale_factor = float(in_size/out_size)
-        
-      
-      Nearest neighbor interpolation:
-      
-      if:
-          align_corners = False
+            if align_corners = True && out_size > 1 :

-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+              scale_factor = (in_size-1.0)/(out_size-1.0)
+            
+            else:
+              
+              scale_factor = float(in_size/out_size)
+            
+          
+        Nearest neighbor interpolation:
+          
+          if:
+              align_corners = False

-          H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
-          W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:

-      else:
-          align_corners = True
+              H_out = floor (H_{in} * scale_{factor})
+              W_out = floor (W_{in} * scale_{factor})

-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+          else:
+              align_corners = True

-          H_out = round(H_{in} * scale_{factor})
-          W_out = round(W_{in} * scale_{factor})
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:

-      Bilinear interpolation:
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})

-      if:
-          align_corners = False , align_mode = 0
-          
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
-          
-          H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-          W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+        Bilinear interpolation:
+
+          if:
+              align_corners = False , align_mode = 0
+              
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5


-      else:
-       
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+          else:
+           
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:

-          H_out = H_{in} * scale_{factor}
-          W_out = W_{in} * scale_{factor}
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}

    For details of nearest neighbor interpolation, please refer to Wikipedia: 
    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
@@ -7048,41 +7143,39 @@ def resize_bilinear(input,
    Align_corners and align_mode are optinal parameters,the calculation 
    method of interpolation can be selected by them.

-
-    Align_corners and align_mode are optinal parameters,the calculation method 
-    of interpolation can be selected by them.
-
    Example:

-      For scale:
-      
-        if align_corners = True && out_size > 1 :
+    .. code-block:: text

-          scale_factor = (in_size-1.0)/(out_size-1.0)
-        
-        else:
+        For scale:
          
-          scale_factor = float(in_size/out_size)     
+            if align_corners = True && out_size > 1 :

-    Bilinear interpolation:
+              scale_factor = (in_size-1.0)/(out_size-1.0)
+            
+            else:
+              
+              scale_factor = float(in_size/out_size)     

-      if:
-          align_corners = False , align_mode = 0
-          
-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
-          
-          H_out = (H_{in}+0.5) * scale_{factor} - 0.5
-          W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+        Bilinear interpolation:
+
+          if:
+              align_corners = False , align_mode = 0
+              
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5


-      else:
+          else:

-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:

-          H_out = H_{in} * scale_{factor}
-          W_out = W_{in} * scale_{factor}
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}



@@ -7134,42 +7227,44 @@ def resize_nearest(input,
                   align_corners=True):
    """
    Resize input by performing nearest neighbor interpolation in both the
-    3rd dimention(in height direction) and the 4th dimention(in width
-    direction) based on given output shape which specified by actual_shape,
+    3rd dimension(in height direction) and the 4th dimension(in width
+    direction) based on given output shape which is specified by actual_shape,
    out_shape and scale in priority order.

    Example:

-      For scale:
-      
-        if align_corners = True && out_size > 1 :
+    .. code-block:: text

-          scale_factor = (in_size-1.0)/(out_size-1.0)
-        
-        else:
+        For scale:
+          
+            if align_corners = True && out_size > 1 :
+
+              scale_factor = (in_size-1.0)/(out_size-1.0)
+            
+            else:
+              
+              scale_factor = float(in_size/out_size)
+            
+          
+        Nearest neighbor interpolation:
          
-          scale_factor = float(in_size/out_size)
-        
-      
-      Nearest neighbor interpolation:
-      
-      if:
-          align_corners = False
+          if:
+              align_corners = False

-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:

-          H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
-          W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
+              H_out = floor(H_{in} * scale_{factor})
+              W_out = floor(W_{in} * scale_{factor})

-      else:
-          align_corners = True
+          else:
+              align_corners = True

-          input : (N,C,H_in,W_in)
-          output: (N,C,H_out,W_out) where:
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:

-          H_out = round(H_{in} * scale_{factor})
-          W_out = round(W_{in} * scale_{factor})
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})


    For details of nearest neighbor interpolation, please refer to Wikipedia:

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -13,15 +13,11 @@
 # limitations under the License.

 from __future__ import print_function
-import multiprocessing
 from . import core
 from . import framework
 from . import executor
-from .. import compat as cpt
-import warnings
+from . import compiler
 import sys
-import six
-import os

 __all__ = ['ParallelExecutor']

@@ -97,99 +93,27 @@ class ParallelExecutor(object):
            'Please use CompiledProgram and Executor. CompiledProgram '
            'is a central place for optimization and Executor is the '
            'unified executor. Example can be found in compiler.py.\n')
-        # step1: get places, the places are used in run too.
-        self._places = []
-        if use_cuda:
-            gpus_env = os.getenv("FLAGS_selected_gpus")
-            if gpus_env:
-                gpus = [int(s) for s in gpus_env.split(",")]
-            else:
-                gpus = [
-                    i for i in six.moves.range(core.get_cuda_device_count())
-                ]
-            self._places = [core.CUDAPlace(i) for i in gpus]
-        else:
-            cpu_num = int(
-                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-        assert self._places, "no place for execution"

-        # step2: init exec_strategy
-        if exec_strategy is None:
-            exec_strategy = ExecutionStrategy()
-        exec_strategy.use_cuda = use_cuda
-        if exec_strategy.num_threads == 0:
-            if use_cuda:
-                # Experiments on se-resnext shows that too many threads hurt
-                # performance. Worth tunning for other models in the future.
-                exec_strategy.num_threads = len(self._places) * 4
-            else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num * 2
-
-        # step3: init build_strategy
        if build_strategy is None:
            build_strategy = BuildStrategy()
        build_strategy.num_trainers = num_trainers
        build_strategy.trainer_id = trainer_id
-        # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
-        # num_trainers is 1, so the current fields of build_strategy doesn't tell if
-        # it's distributed model.
-        build_strategy.is_distribution = framework.is_pserver_mode(
-            main_program) or num_trainers > 1
-
-        # step4: get main_program, scope, local_scopes
-        main = main_program if main_program \
-            else framework.default_main_program()
-        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
-        # if turn on python memory optimize, turn off the inplace_pass.
-        if build_strategy.memory_optimize is None:
-            build_strategy.memory_optimize = False if main._is_mem_optimized else True
-        if build_strategy.enable_inplace is None:
-            build_strategy.enable_inplace = False if main._is_mem_optimized else True
-        scope = scope if scope is not None else executor.global_scope()
-
-        if share_vars_from and not isinstance(share_vars_from,
-                                              ParallelExecutor):
-            raise TypeError("share_vars_from must be ParallelExecutor.")
-
-        local_scopes = share_vars_from.executor.local_scopes()\
-            if share_vars_from else []
-
-        # step5: check trainers_endpoints, it is used for distribution.
-        trainers_endpoints = main._trainers_endpoints
-        if num_trainers > 1 and trainers_endpoints:
-            assert num_trainers == len(
-                trainers_endpoints), "num_trainers == len(endpoints)"
-            build_strategy.trainers_endpoints = trainers_endpoints
-
-        # step6: get persistable_vars, places. persistable_vars
-        # need be broadcast to other local_scope.
-        persistable_vars = set([
-            cpt.to_text(v.name) for v in [
-                var for var in main.list_vars()
-                if var.persistable and var.type != core.VarDesc.VarType.RAW
-            ]
-        ])
-
-        def place_obj(place):
-            p = core.Place()
-            p.set_place(place)
-            return p
-
-        places = list(map(place_obj, self._places))

-        # step7: init ParallelExecutor
-        # ParallelExecutor API will be deprecated, don't support parallel graph.
-        self._graph = core.Graph(main.desc)
+        self._places = compiler.get_available_places(use_cuda)
+        self._scope = scope if scope is not None else executor.global_scope()

-        self.executor = core.ParallelExecutor(
-            places, persistable_vars,
-            cpt.to_text(loss_name) if loss_name else six.u(''), scope,
-            local_scopes, exec_strategy, build_strategy, self._graph)
+        main_program = main_program if main_program is not None \
+            else framework.default_main_program()

-        self.scope = scope
+        self._compiled_program = compiler.CompiledProgram(main_program)
+        self._compiled_program.with_data_parallel(
+            loss_name=loss_name,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy,
+            share_vars_from=share_vars_from)
+        self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
+        self._executor = executor.Executor(self._place)
+        self._compiled_program._compile(place=self._place, scope=self._scope)

    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
        """
@@ -256,56 +180,11 @@ class ParallelExecutor(object):
                loss = pe.run(feed=feeder.feed(cur_batch),
                              fetch_list=[avg_cost.name]))
        """
-        if feed is None and feed_dict is not None:
-            feed = feed_dict
-            print(
-                "`feed_dict` is deprecated. Please use `feed=`",
-                file=sys.stderr)
-
-        if isinstance(feed, dict):
-            feed_tensor_dict = dict()
-            for feed_name in feed:
-                feed_tensor = feed[feed_name]
-                if not isinstance(feed_tensor, core.LoDTensor):
-                    feed_tensor = core.LoDTensor()
-                    # always set to CPU place, since the tensor need to be splitted
-                    # it is fast in CPU
-                    feed_tensor.set(feed[feed_name], core.CPUPlace())
-                feed_tensor_dict[feed_name] = feed_tensor
-
-            self.executor.feed_and_split_tensor_into_local_scopes(
-                feed_tensor_dict)
-        elif isinstance(feed, list) or isinstance(feed, tuple):
-            if len(feed) != len(self._places):
-                raise ValueError(
-                    "Feed a list of tensor, the list should be the same size as places"
-                )
-
-            res = list()
-
-            for i, each in enumerate(feed):
-                if not isinstance(each, dict):
-                    raise TypeError(
-                        "Each element of feed list should be a dict")
-                res_dict = dict()
-                for feed_name in each:
-                    tensor = each[feed_name]
-                    if not isinstance(tensor, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        tmp.set(tensor, self._places[i])
-                        tensor = tmp
-                    res_dict[feed_name] = tensor
-                res.append(res_dict)
-            self.executor.feed_tensors_into_local_scopes(res)
-
-        fetch_var_name = 'fetch'
-        self.executor.run(fetch_list, fetch_var_name)
-        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
-
-        if return_numpy:
-            return executor.as_numpy(arr)
-
-        return [arr[i] for i in range(len(arr))]
+        return self._executor.run(program=self._compiled_program,
+                                  scope=self._scope,
+                                  feed=feed,
+                                  fetch_list=fetch_list,
+                                  return_numpy=return_numpy)

    @property
    def device_count(self):

--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
@@ -15,44 +15,139 @@
 from __future__ import print_function

 import unittest
+import numpy as np

-from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp


-class TestMKLDNN(TestConv2dOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
+def conv2d_bias_naive(out, bias):
+    _, out_c, _, _ = out.shape

+    for l in range(out_c):
+        out[:, l, :, :] = out[:, l, :, :] + bias[l]
+    return out

-class TestMKLDNNWithPad(TestWithPad):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"

+def conv2d_residual_naive(out, residual):
+    assert out.shape == residual.shape
+    out = np.add(out, residual)
+    return out

-class TestMKLDNNWithStride(TestWithStride):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"

+class TestConv2dMKLDNNOp(TestConv2dOp):
+    def init_group(self):
+        self.groups = 1

-class TestMKLDNNWithGroup(TestWithGroup):
    def init_kernel_type(self):
-        self.use_mkldnn = True
        self.data_format = "NCHW"
+        self.use_mkldnn = True
+        self._cpu_only = True

+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]

-class TestMKLDNNWith1x1(TestWith1x1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
+    def setUp(self):
+        self.fuse_bias = False
+        self.bias_size = None
+        self.fuse_relu = False
+        self.fuse_residual_connection = False
+        self.input_residual_size = None
+        TestConv2dOp.setUp(self)

+        output = self.outputs['Output']

-class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
+        #mkldnn only support either conv-sum-relu, or conv-relu.
+        if self.fuse_bias and self.bias_size is not None:
+            bias = np.random.random(self.bias_size).astype(self.dtype)
+            output = conv2d_bias_naive(output, bias)
+            output = output.astype(self.dtype)
+            self.attrs['fuse_bias'] = self.fuse_bias
+            self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
+
+        if self.fuse_residual_connection and self.input_residual_size is not None:
+            input_residual = np.random.random(self.input_residual_size).astype(
+                self.dtype)
+            output = conv2d_residual_naive(output, input_residual)
+
+            self.attrs[
+                'fuse_residual_connection'] = self.fuse_residual_connection
+            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
+                input_residual)
+
+        if self.fuse_relu:
+            output = np.maximum(output, 0).astype(self.dsttype)
+
+        output = output.astype(self.dtype)
+
+        self.attrs['fuse_bias'] = self.fuse_bias
+        self.attrs['fuse_relu'] = self.fuse_relu
+        self.attrs['fuse_residual_connection'] = self.fuse_residual_connection
+
+        self.outputs['Output'] = output
+
+
+class TestWithFuse(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.fuse_bias = True
+        self.bias_size = [6]
+        self.fuse_residual_connection = True
+        self.input_residual_size = [2, 6, 5, 5]
+
+    def test_check_grad(self):
+        pass
+
+    def test_check_grad_no_filter(self):
+        pass
+
+    def test_check_grad_no_input(self):
+        pass
+
+
+class TestWithPadWithBias(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.input_size = [2, 3, 6, 6]
+
+
+class TestWithStride(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]
+
+
+class TestWithGroup(TestConv2dMKLDNNOp):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWith1x1(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.filter_size = [6, 3, 1, 1]
+
+
+class TestWithInput1x1Filter1x1(TestConv2dMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dMKLDNNOp.init_test_case(self)
+        self.input_size = [2, 3, 1, 1]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 3


 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
@@ -15,36 +15,22 @@
 from __future__ import print_function

 import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest

-from paddle.fluid.tests.unittests.test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
+from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2dTransposeOp


-class TestMKLDNN(TestConv2dTransposeOp):
-    def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-        self.op_type = "conv2d_transpose"
-        self._cpu_only = True
-
-    def test_check_grad(self):
-        return
+def conv2d_bias_naive(out, bias):
+    _, out_c, _, _ = out.shape

-    def test_check_grad_no_input(self):
-        return
-
-    def test_check_grad_no_filter(self):
-        return
+    for l in range(out_c):
+        out[:, l, :, :] = out[:, l, :, :] + bias[l]
+    return out


-class TestMKLDNNWithPad(TestWithPad):
-    def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
-        self.data_format = "NCHW"
-        self.op_type = "conv2d_transpose"
-        self._cpu_only = True
-
+class TestConv2dTransposeMKLDNNOp(TestConv2dTransposeOp):
    def test_check_grad(self):
        return

@@ -54,24 +40,64 @@ class TestMKLDNNWithPad(TestWithPad):
    def test_check_grad_no_filter(self):
        return

-
-class TestMKLDNNWithStride(TestWithStride):
    def init_op_type(self):
-        self.is_test = True
-        self.use_mkldnn = True
        self.data_format = "NCHW"
        self.op_type = "conv2d_transpose"
        self._cpu_only = True

-    def test_check_grad(self):
-        return
-
-    def test_check_grad_no_input(self):
-        return
-
-    def test_check_grad_no_filter(self):
-        return
-
-
-if __name__ == '__main__':
-    unittest.main()
+    def init_test_case(self):
+        self.use_mkldnn = True
+        self.is_test = True
+        self.pad = [0, 0]
+        self.fuse_bias = False
+        self.bias_size = None
+        self.fuse_relu = False
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+        self.groups = 1
+
+    def setUp(self):
+        TestConv2dTransposeOp.setUp(self)
+
+        output = self.outputs['Output']
+
+        if self.fuse_bias and self.bias_size is not None:
+            bias = np.random.random(self.bias_size).astype(self.dtype)
+            output = conv2d_bias_naive(output, bias)
+            output = output.astype(self.dtype)
+            self.attrs['fuse_bias'] = self.fuse_bias
+            self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias)
+
+        if self.fuse_relu:
+            output = np.maximum(output, 0).astype(self.dtype)
+
+        self.attrs['fuse_bias'] = self.fuse_bias
+        self.attrs['fuse_relu'] = self.fuse_relu
+
+        self.outputs['Output'] = output
+
+
+class TestMKLDNNFuseBias(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.fuse_bias = True
+        self.bias_size = [6]
+
+
+class TestMKLDNNWithPad(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.input_size = [2, 3, 10, 10]
+
+
+class TestMKLDNNWithStride(TestConv2dTransposeMKLDNNOp):
+    def init_test_case(self):
+        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
@@ -18,6 +18,24 @@ import unittest
 from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5


+def create_test_mkldnn_use_ceil_class(parent):
+    class TestMKLDNNPool2DUseCeilCase(parent):
+        def init_kernel_type(self):
+            self.use_mkldnn = True
+
+        def init_ceil_mode(self):
+            self.ceil_mode = True
+
+    cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNCeilModeCast")
+    TestMKLDNNPool2DUseCeilCase.__name__ = cls_name
+    globals()[cls_name] = TestMKLDNNPool2DUseCeilCase
+
+
+create_test_mkldnn_use_ceil_class(TestPool2D_Op)
+create_test_mkldnn_use_ceil_class(TestCase1)
+create_test_mkldnn_use_ceil_class(TestCase2)
+
+
 def create_test_mkldnn_class(parent):
    class TestMKLDNNCase(parent):
        def init_kernel_type(self):

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -115,6 +115,9 @@ class TestDistRunnerBase(object):
        strategy.allow_op_delay = False

        build_stra = fluid.BuildStrategy()
+        # FIXME force disable enable_inplace and memory_optimize
+        build_stra.enable_inplace = False
+        build_stra.memory_optimize = False

        if args.use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -123,6 +123,9 @@ class TestMNIST(TestParallelExecutorBase):

        # NOTE(dzh):
        # need to make it compatible with elewise fuse act
+        # FIXME (liuwei12)
+        # the new memory optimize strategy will crash this unittest
+        # add enable_inplace=False here to force pass the unittest
        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
            model,
            feed_dict={"image": img,
@@ -131,6 +134,7 @@ class TestMNIST(TestParallelExecutorBase):
            fuse_elewise_add_act_ops=False,
            memory_opt=False,
            use_ir_memory_optimize=False,
+            enable_inplace=False,
            optimizer=_optimizer)
        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
            model,
@@ -140,6 +144,7 @@ class TestMNIST(TestParallelExecutorBase):
            fuse_elewise_add_act_ops=True,
            memory_opt=False,
            use_ir_memory_optimize=False,
+            enable_inplace=False,
            optimizer=_optimizer)

        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):

--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# nlp model stack of op operate on lod. It's a classical test case in optimize pass.
+
+from __future__ import print_function
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+import unittest
+import paddle.fluid.core as core
+
+from paddle.fluid import compiler, Program, program_guard
+from paddle.fluid.executor import Executor
+from paddle.fluid.backward import append_backward
+from paddle.fluid.optimizer import MomentumOptimizer
+from ir_memory_optimize_net_base import TestIrMemOptBase
+
+
+class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
+    def check_network_convergence(self, use_cuda=True, py_opt=False,
+                                  iter_num=5):
+        prog = Program()
+        startup_prog = Program()
+        prog.random_seed = 100
+        startup_prog.random_seed = 100
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')
+
+            label = layers.data(name='y', shape=[1], dtype='int64')
+
+            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
+            cond = layers.less_than(x=label, y=limit)
+            ie = layers.IfElse(cond)
+
+            with ie.true_block():
+                true_image = ie.input(image)
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            with ie.false_block():
+                false_image = ie.input(image)
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            prob = ie()
+            loss = layers.cross_entropy(input=prob[0], label=label)
+            avg_loss = layers.mean(loss)
+
+            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+            optimizer.minimize(avg_loss, startup_prog)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=200)
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = Executor(place)
+
+            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy.use_cuda = use_cuda
+
+            if py_opt:
+                fluid.memory_optimize(fluid.default_main_program())
+            train_cp = compiler.CompiledProgram(fluid.default_main_program())
+            train_cp = train_cp.with_data_parallel(
+                loss_name=avg_loss.name, exec_strategy=exec_strategy)
+            fetch_list = [avg_loss.name]
+
+            exe.run(startup_prog)
+            PASS_NUM = 100
+            loop = 0
+            ret = []
+            for pass_id in range(PASS_NUM):
+                for data in train_reader():
+                    x_data = np.array([x[0] for x in data]).astype("float32")
+                    y_data = np.array([x[1] for x in data]).astype("int64")
+                    y_data = y_data.reshape((y_data.shape[0], 1))
+
+                    outs = exe.run(train_cp,
+                                   feed={'x': x_data,
+                                         'y': y_data},
+                                   fetch_list=[avg_loss])
+
+                    loop += 1
+                    ret.append(outs[0])
+                    if iter_num == loop:
+                        return ret
+            return ret
+
+    def test_ifelse(self):
+        ret1 = self.check_network_convergence(False, True)
+        print(ret1)
+        ret2 = self.check_network_convergence(False, False)
+        print(ret2)
+        self.assertTrue(np.allclose(ret1, ret2))
+
+        if fluid.core.is_compiled_with_cuda():
+            ret1 = self.check_network_convergence(True, True)
+            print(ret1)
+            ret2 = self.check_network_convergence(True, False)
+            print(ret2)
+            self.assertTrue(np.allclose(ret1, ret2))
+            #self.assertEqual(ret1, ret2)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1035,6 +1035,19 @@ class TestBook(unittest.TestCase):

        print(str(program))

+    def test_spectral_norm(self):
+        program = Program()
+        with program_guard(program):
+            weight = layers.data(
+                name='weight',
+                shape=[2, 3, 32, 32],
+                dtype="float32",
+                append_batch_size=False)
+            out = layers.spectral_norm(weight, dim=1, power_iters=1)
+            self.assertIsNotNone(out)
+
+        print(str(program))
+
    def test_shuffle_channel(self):
        program = Program()
        with program_guard(program):

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -59,8 +59,12 @@ class TestFetchAndFeed(unittest.TestCase):
        exe = fluid.Executor(place)
        exe.run(startup)

+        #FIXME force disable enable_inplace and memory_optimize to pass the unittest
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.enable_inplace = False
+        build_strategy.memory_optimize = False
        train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
-            loss_name=loss.name)
+            loss_name=loss.name, build_strategy=build_strategy)

        run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)


--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
@@ -96,6 +96,9 @@ class TestPassBuilder(unittest.TestCase):
        build_strategy = fluid.BuildStrategy()
        self.assertFalse(build_strategy.fuse_elewise_add_act_ops)
        build_strategy.fuse_elewise_add_act_ops = True
+        #FIXME: currently fuse_elewise_add_act_ops not compatible with below options
+        build_strategy.enable_inplace = False
+        build_strategy.memory_optimize = False
        pass_builder = build_strategy._finalize_strategy_and_create_passes()
        self.assertTrue("fuse_elewise_add_act_pass" in
                        [p.type() for p in pass_builder.all_passes()])

--- a/python/paddle/fluid/tests/unittests/test_py_func_op.py
+++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py
@@ -142,6 +142,10 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

+            #FIXME force use old memory optimzie strategy here to pass the unittest
+            #since open the new strategy will crash the unittest
+            fluid.memory_optimize(fluid.default_main_program())
+
            train_cp = compiler.CompiledProgram(fluid.default_main_program())
            if use_parallel_executor:
                train_cp = train_cp.with_data_parallel(loss_name=loss.name)

--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
@@ -49,6 +49,21 @@ class TestSequenceEraseOpInt32(OpTest):
        self.check_output()


+class TestSequenceEraseOpInt32LoD2(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        lod = [[1, 3], [9, 4, 11, 6]]
+        tokens = [2, 3, 5]
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[-1], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, lod[:-1] + [new_lod0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestSequenceEraseOpInt64(OpTest):
    def setUp(self):
        self.op_type = "sequence_erase"

--- a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+from paddle.fluid import core
+
+
+def spectral_norm(weight, u, v, dim, power_iters, eps):
+    shape = weight.shape
+    weight_mat = weight.copy()
+    h = shape[dim]
+    w = np.prod(shape) // h
+    if dim != 0:
+        perm = [dim] + [d for d in range(len(shape)) if d != dim]
+        weight_mat = weight_mat.transpose(perm)
+    weight_mat = weight_mat.reshape((h, w))
+
+    u = u.reshape((h, 1))
+    v = v.reshape((w, 1))
+    for i in range(power_iters):
+        v = np.matmul(weight_mat.T, u)
+        v_norm = np.sqrt((v * v).sum())
+        v = v / (v_norm + eps)
+        u = np.matmul(weight_mat, v)
+        u_norm = np.sqrt((u * u).sum())
+        u = u / (u_norm + eps)
+
+    sigma = (u * np.matmul(weight_mat, v)).sum()
+    return weight / sigma
+
+
+class TestSpectralNormOpNoGrad(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = 'spectral_norm'
+        weight = np.random.random(self.weight_shape).astype('float32')
+        u = np.random.normal(0., 1., self.u_shape).astype('float32')
+        v = np.random.normal(0., 1., self.v_shape).astype('float32')
+
+        self.attrs = {
+            "dim": self.dim,
+            "power_iters": self.power_iters,
+            "eps": self.eps,
+        }
+
+        self.inputs = {
+            "Weight": weight,
+            "U": u,
+            "V": v,
+        }
+
+        output = spectral_norm(weight, u, v, self.dim, self.power_iters,
+                               self.eps)
+        self.outputs = {"Out": output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def initTestCase(self):
+        self.weight_shape = (2, 3)
+        self.u_shape = (2, )
+        self.v_shape = (3, )
+        self.dim = 0
+        self.power_iters = 5
+        self.eps = 1e-12
+
+
+class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad):
+    def initTestCase(self):
+        self.weight_shape = (2, 3, 3, 3)
+        self.u_shape = (3, )
+        self.v_shape = (18, )
+        self.dim = 1
+        self.power_iters = 10
+        self.eps = 1e-12
+
+
+class TestSpectralNormOp(TestSpectralNormOpNoGrad):
+    def test_check_grad_ignore_uv(self):
+        self.check_grad(
+            ['Weight'],
+            'Out',
+            no_grad_set=set(["U", "V"]),
+            max_relative_error=0.1)
+
+    def initTestCase(self):
+        self.weight_shape = (2, 3)
+        self.u_shape = (2, )
+        self.v_shape = (3, )
+        self.dim = 0
+        self.power_iters = 0
+        self.eps = 1e-12
+
+
+class TestSpectralNormOp2(TestSpectralNormOp):
+    def initTestCase(self):
+        self.weight_shape = (2, 3, 3, 3)
+        self.u_shape = (3, )
+        self.v_shape = (18, )
+        self.dim = 1
+        self.power_iters = 0
+        self.eps = 1e-12
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tools/check_doc_approval.py
+++ b/tools/check_doc_approval.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import ast
-import hashlib
-import importlib
-import paddle.fluid
-
-files = [
-    "paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward",
-    "paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor",
-    "paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers",
-    "paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer",
-    "paddle.fluid.profiler", "paddle.fluid.recordio_writer",
-    "paddle.fluid.regularizer", "paddle.fluid.transpiler"
-]
-
-
-def md5(doc):
-    hash = hashlib.md5()
-    hash.update(str(doc))
-    return hash.hexdigest()
-
-
-def get_module():
-    for fi in files:
-        fi_lib = importlib.import_module(fi)
-        doc_function = getattr(fi_lib, "__all__")
-        for api in doc_function:
-            api_name = fi + "." + api
-            try:
-                doc_module = getattr(eval(api_name), "__doc__")
-            except:
-                pass
-            doc_md5_code = md5(doc_module)
-            doc_dict[api_name] = doc_md5_code
-
-
-def doc_md5_dict(doc_md5_path):
-    with open(doc_md5_path, "rb") as f:
-        doc_md5 = f.read()
-        doc_md5_dict = ast.literal_eval(doc_md5)
-    return doc_md5_dict
-
-
-def check_doc_md5():
-    for k, v in doc_dict.items():
-        try:
-            if doc_ci_dict[k] != v:
-                return doc_dict
-        except:
-            return doc_dict
-    return True
-
-
-if __name__ == "__main__":
-    doc_dict = {}
-    doc_ci_dict = {}
-    doc_md5_file = "/root/.cache/doc_md5.txt"
-    if not os.path.exists(doc_md5_file):
-        os.mknod(doc_md5_file)
-    else:
-        doc_ci_dict = doc_md5_dict(doc_md5_file)
-    get_module()
-    if not os.path.getsize(doc_md5_file):
-        with open(doc_md5_file, 'w') as f:
-            f.write(str(doc_dict))
-        check_dic = True
-        print(check_dic)
-    else:
-        check_dic = check_doc_md5()
-        print(check_dic)
--- a/tools/diff_api.py
+++ b/tools/diff_api.py
@@ -26,4 +26,10 @@ for each_diff in result:
        print(each_diff)

 if error:
+    print(
+        '''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
+    1. cd ${paddle_path}, compile paddle;
+    2. pip install build/python/dist/(build whl package);
+    3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"'''
+    )
    sys.exit(1)
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -24,12 +24,19 @@ import inspect
 import collections
 import sys
 import pydoc
+import hashlib

 member_dict = collections.OrderedDict()

 experimental_namespace = {"paddle.fluid.imperative"}


+def md5(doc):
+    hash = hashlib.md5()
+    hash.update(str(doc).encode('utf-8'))
+    return hash.hexdigest()
+
+
 def visit_member(parent_name, member):
    cur_name = ".".join([parent_name, member.__name__])
    if inspect.isclass(member):
@@ -39,7 +46,10 @@ def visit_member(parent_name, member):
                visit_member(cur_name, value)
    elif callable(member):
        try:
-            member_dict[cur_name] = inspect.getargspec(member)
+            doc = ('document', md5(member.__doc__))
+            args = inspect.getargspec(member)
+            all = (args, doc)
+            member_dict[cur_name] = all
        except TypeError:  # special for PyBind method
            member_dict[cur_name] = "  ".join([
                line.strip() for line in pydoc.render_doc(member).split('\n')