Merge branch 'develop' of https://github.com/paddlepaddle/paddle into HEAD

3df7b98a · nhzlx · f3a2e4b3 · 33965527 · 3df7b98a · 3df7b98a
29 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -56,7 +56,7 @@ paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_pr
 paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95'))
 paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2'))
 paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2'))
-paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c'))
+paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d'))
 paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb'))
 paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -109,7 +109,7 @@ paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name
 paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
 paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
 paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
-paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f'))
+paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'f1dd22f7351f7f9853212958e0d8aa7a'))
 paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
 paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
 paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
@@ -205,7 +205,7 @@ paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None,
 paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e'))
 paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed'))
 paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f'))
-paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), ('document', '2f46f1ff39a13ab00857e7b9f44b2fa7'))
+paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9'))
 paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35'))
 paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007'))
 paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d'))

--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -112,19 +112,20 @@ void FusedAllReduceOpHandle::RunImpl() {
        });
    for (size_t k = 1; k < g_tensor.size(); ++k) {
-      const void *pre_address = g_tensor.at(k - 1).second->data<void>();
+      const void *cur_address = g_tensor.at(k - 1).second->data<void>();
      int64_t len = g_tensor.at(k - 1).second->numel();
      auto offset = len * framework::SizeOfType(dtype);
-      void *next_address = reinterpret_cast<void *>(
+      void *infer_next_address = reinterpret_cast<void *>(
-          reinterpret_cast<uintptr_t>(pre_address) + offset);
+          reinterpret_cast<uintptr_t>(cur_address) + offset);
-      const void *cur_address = g_tensor.at(k).second->data<void>();
+      const void *next_address = g_tensor.at(k).second->data<void>();
-      VLOG(10) << k << ", "
-               << " pre_address(" << g_tensor.at(k - 1).first
+      VLOG(10) << string::Sprintf(
-               << "): " << pre_address << ", cur_address("
+          "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer "
-               << g_tensor.at(k).first << "): " << cur_address
+          "input[%d] address: 0X%02x. The offset: %d",
-               << ", offset:" << offset << ", " << next_address << ", "
+          k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
-               << cur_address;
+          next_address, k, infer_next_address, offset);
-      PADDLE_ENFORCE_EQ(next_address, cur_address);
+      PADDLE_ENFORCE_EQ(infer_next_address, next_address,
+                        "The address is not consistent.");
    }
  }

--- a/paddle/fluid/framework/ir/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
@@ -224,8 +224,8 @@ std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
  PADDLE_ENFORCE(param_scope());
+  QuantizeConv(graph.get(), false /* with_residual_data */);
  QuantizeConv(graph.get(), true /* with_residual_data */);
-  QuantizeConv(graph.get());
  QuantizePool(graph.get());
  return graph;

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -599,10 +599,19 @@ bool VarLinksToOp(Node *node, const std::string &op_type) {
 bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) {
  PADDLE_ENFORCE(var->IsVar());
  PADDLE_ENFORCE(op->IsOp());
-  if (op->Op()->Input(argument).size() <= nth) return false;
+  if (!HasInput(op, argument) || op->Op()->Input(argument).size() <= nth)
+    return false;
  return var->Name() == op->Op()->Input(argument)[nth];
 }
+bool HasInput(Node *op, const std::string &argument) {
+  PADDLE_ENFORCE(op->IsOp());
+  auto const &names = op->Op()->InputNames();
+  if (std::find(names.begin(), names.end(), argument) == names.end())
+    return false;
+  return true;
+}
 bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) {
  PADDLE_ENFORCE(var->IsVar());
  PADDLE_ENFORCE(op->IsOp());
@@ -1082,8 +1091,15 @@ PDNode *patterns::Conv::operator()() {
 PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
-  if (!with_residual_data)
+  if (!with_residual_data) {
-    conv_op->assert_op_attr("fuse_residual_connection", false);
+    conv_op->assert_more([&](Node *x) {
+      auto node_names = x->Op()->InputNames();
+      if (!HasInput(x, "ResidualData") ||
+          x->Op()->Input("ResidualData").size() == 0)
+        return true;
+      return false;
+    });
+  }
  auto input_var = pattern->NewNode(conv_input_repr())
                       ->AsInput()

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -305,6 +305,9 @@ bool VarLinksFromOp(Node* node, const std::string& op_type);
 // Check whether a var node is a op node's nth input.
 bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
+// Check whether the op node has input of given name.
+bool HasInput(Node* op, const std::string& argument);
 // Tell whether a var node is a op node's nth output.
 bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);

--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
@@ -14,12 +14,16 @@ limitations under the License. */
 #pragma once
+#include <memory>
 #include "paddle/fluid/framework/ir/pass.h"
 namespace paddle {
 namespace framework {
 namespace ir {
+/*
+ * Specifies which operators should use MKLDNN.
+ */
 class MKLDNNPlacementPass : public Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -136,6 +136,15 @@ struct Argument {
  // Pass a set of op types to enable its mkldnn kernel
  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                      std::unordered_set<std::string>);
+  // A set of op types to enable their quantized kernels
+  DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
+                      std::unordered_set<std::string>);
+  // A set of op IDs to exclude from enabling their quantized kernels
+  DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds,
+                      std::unordered_set<int>);
  // Scales for variables to be quantized
  DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -64,6 +64,13 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(
                    argument->mkldnn_enabled_op_types()));
+    } else if (pass_name == "cpu_quantize_placement_pass") {
+      pass->Set("quantize_enabled_op_types",
+                new std::unordered_set<std::string>(
+                    argument->quantize_enabled_op_types()));
+      pass->Set(
+          "quantize_excluded_op_ids",
+          new std::unordered_set<int>(argument->quantize_excluded_op_ids()));
    } else if (pass_name == "cpu_quantize_pass") {
      pass->Set("quant_var_scales",
                new VarQuantScale(argument->quant_var_scales()));

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -107,6 +107,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
  use_gpu_ = true;
 }
+void GpuPassStrategy::EnableQuantizer() {
+  LOG(ERROR) << "GPU not support quantization yet";
+}
 void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
  analysis_passes_.push_back(pass);
 }

--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -85,6 +85,10 @@ class PassStrategy : public PaddlePassBuilder {
   */
  virtual void EnableMKLDNN() {}
+  /** Enable quantize optimization
+   */
+  virtual void EnableQuantizer() {}
  bool use_gpu() const { return use_gpu_; }
  virtual ~PassStrategy() = default;
@@ -125,6 +129,16 @@ class CpuPassStrategy : public PassStrategy {
    use_mkldnn_ = false;
 #endif
  }
+  void EnableQuantizer() override {
+    if (!use_quantizer_) {
+      passes_.push_back("cpu_quantize_placement_pass");
+    }
+    use_quantizer_ = true;
+  }
+ protected:
+  bool use_quantizer_{false};
 };
 /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -139,6 +153,7 @@ class GpuPassStrategy : public PassStrategy {
  }
  void EnableMKLDNN() override;
+  void EnableQuantizer() override;
  virtual ~GpuPassStrategy() = default;
 };

--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
-cc_binary(visualizer SRCS visualizer.cc DEPS analysis
-    paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
--- a/paddle/fluid/inference/utils/visualizer.cc
+++ b/paddle/fluid/inference/utils/visualizer.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/inference/utils/visualizer.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <fstream>
-#include <memory>
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
-#include "paddle/fluid/platform/init.h"
-DEFINE_string(model_dir, "", "model directory");
-DEFINE_string(model_program_path, "", "model program path");
-DEFINE_string(model_params_path, "", "model params path");
-using paddle::inference::analysis::Argument;
-namespace paddle {
-namespace inference {
-namespace utils {
-void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
-bool Visualizer::Run() {
-  paddle::framework::InitDevices(false);
-  paddle::inference::analysis::Analyzer().Run(argument_);
-  return true;
-}
-}  // namespace utils
-}  // namespace inference
-}  // namespace paddle
-// Generate a dot file describing the structure of graph.
-// To use this tool, run command: ./visualizer [options...]
-// Options:
-//     --model_dir: the directory of model
-//     --model_program_path: the path of program
-//     --model_params_path: the path of params
-int main(int argc, char *argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  google::InitGoogleLogging(argv[0]);
-  paddle::inference::analysis::Argument argument;
-  argument.SetUseGPU(false);
-  argument.SetUseTensorRT(false);
-  if (FLAGS_model_dir.empty()) {
-    if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
-      LOG(ERROR) << "Please set model_dir"
-                    " or model_program_path and model_params_path";
-      return -1;
-    } else {
-      argument.SetModelProgramPath(FLAGS_model_program_path);
-      argument.SetModelParamsPath(FLAGS_model_params_path);
-    }
-  } else {
-    argument.SetModelDir(FLAGS_model_dir);
-  }
-  // Only 1 pass, default filename is 0_ir_origin.dot
-  // For more details, looking for paddle::inference::analysis::IRPassManager
-  argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"});
-  std::unique_ptr<paddle::framework::Scope> scope{
-      new paddle::framework::Scope()};
-  argument.SetScopeNotOwned(
-      const_cast<paddle::framework::Scope *>(scope.get()));
-  paddle::inference::utils::Visualizer visualizer;
-  visualizer.SetArgument(&argument);
-  visualizer.Run();
-  return 0;
-}
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(graph_to_program_pass);
--- a/paddle/fluid/inference/utils/visualizer.h
+++ b/paddle/fluid/inference/utils/visualizer.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <string>
-#include "paddle/fluid/inference/analysis/argument.h"
-namespace paddle {
-namespace inference {
-namespace utils {
-using paddle::inference::analysis::Argument;
-class Visualizer final {
- public:
-  Visualizer() = default;
-  ~Visualizer() = default;
-  Visualizer(const Visualizer &) = delete;
-  Visualizer &operator=(const Visualizer &) = delete;
-  void SetArgument(Argument *);
-  bool Run();
- private:
-  Argument *argument_;
-};
-}  // namespace utils
-}  // namespace inference
-}  // namespace paddle
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -76,12 +76,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                      const std::string& name) {
  framework::LibraryType library{framework::LibraryType::kPlain};
  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-#ifdef PADDLE_WITH_CUDA
+// FIXME(liuwei1031) temporarily disable the code to unblock users
-  auto it1 = oper.Attrs().find("use_cudnn");
+// TODO(liuwei1031) figure out the reason behind
-  if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
+// https://github.com/PaddlePaddle/Paddle/issues/16096
-    library = framework::LibraryType::kCUDNN;
+// and re-enable this in the future
-  }
+// #ifdef PADDLE_WITH_CUDA
-#endif
+//   auto it1 = oper.Attrs().find("use_cudnn");
+//   if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
+//     library = framework::LibraryType::kCUDNN;
+//   }
+// #endif
 #ifdef PADDLE_WITH_MKLDNN
  auto it = oper.Attrs().find("use_mkldnn");
  if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&

--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -67,6 +67,22 @@ class AffineChannelOp : public framework::OperatorWithKernel {
                   "Input(Bias) of AffineChannelOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of AffineChannelOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto scale_dims = ctx->GetInputDim("Scale");
+    auto b_dims = ctx->GetInputDim("Bias");
+    const framework::DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
+    const int64_t C = (data_layout == framework::DataLayout::kNCHW
+                           ? x_dims[1]
+                           : x_dims[x_dims.size() - 1]);
+    PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL);
+    PADDLE_ENFORCE_EQ(scale_dims[0], C);
+    PADDLE_ENFORCE_EQ(b_dims.size(), 1UL);
+    PADDLE_ENFORCE_EQ(b_dims[0], C);
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->ShareLoD("X", "Out");
  }
@@ -97,6 +113,27 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel {
  }
 };
+class AffineChannelGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("affine_channel_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Scale", Input("Scale"));
+    op->SetAttrMap(Attrs());
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
 template <typename T>
 using EigenArrayMap =
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
@@ -244,8 +281,7 @@ namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
 REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
-                  ops::AffineChannelOpMaker,
+                  ops::AffineChannelOpMaker, ops::AffineChannelGradMaker);
-                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);
 REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,

--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -71,7 +71,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
        "1. downgrade_in_infer(default), downgrade the outcome at inference "
        "time"
        "   train: out = input * mask"
-        "   inference: out = input * dropout_prob"
+        "   inference: out = input * (1.0 - dropout_prob)"
        "2. upscale_in_train, upscale the outcome at training time, do nothing "
        "in inference"
        "   train: out = input * mask / ( 1.0 - dropout_prob )"

--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -32,7 +32,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
      PADDLE_ENFORCE_GT(level0.size(), 1,
                        "If Input(Y) not provided, the target lod should be "
                        "specified by attribute `target_lod`.");
+    } else {
+      ctx->ShareLoD("Y", "Out");
    }
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  }

--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -78,12 +78,6 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                   "The numel of 'pad_value' can only be 1 or be equal to the "
                   "'step_width'.");
-    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
-      TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor);
-      pad_tensor->Resize(pad_tensor_dims);
-      return;
-    }
    const int kBlockSize = 512;
    /* At least use 32 threads to copy sequence_width elements,
@@ -129,12 +123,13 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
              step_width, layout);
+    /*
    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
      TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor);
      seq_tensor->Resize(seq_tensor_dims);
      return;
    }
+    */
    const int kBlockSize = 512;

--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -290,8 +290,10 @@ class MatMulOp : public framework::OperatorWithKernel {
                                     context->Attrs().Get<bool>("transpose_Y"));
    PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
+    if (context->IsRuntime()) {
      PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
                     mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
+    }
    std::vector<int64_t> dim_out;
    if (mat_dim_x.batch_size_ != 0) {
      dim_out = framework::vectorize(dim_x);

--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -40,7 +40,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
                        "tensor's rank.");
    }
-    auto out_dims = GetOutputShape(axes, x_dims);
+    auto out_dims = GetOutputShape(axes, x_dims, false);
    ctx->SetOutputDim("Out", out_dims);
    if (x_dims[0] == out_dims[0]) {
      // Only pass LoD when the first dimension of output and Input(X)
@@ -50,7 +50,8 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
  }
  static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
-                                        const framework::DDim &in_dims) {
+                                        const framework::DDim &in_dims,
+                                        bool is_runtime) {
    size_t num_squeeze_dims = squeeze_dims.size();
    int cnt_squeezed_dims = 0;
    bool should_squeeze[9] = {false};
@@ -71,9 +72,12 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
        // Check current index, the upper limit has beed checked in line 36.
        PADDLE_ENFORCE(current >= 0,
                       "Invalid axis, the negative axis is out of range.");
+        if (is_runtime) {
          PADDLE_ENFORCE(in_dims[current] == 1,
                         "Invalid axis index, the axis that will be squeezed "
                         "should be equal to 1.");
+        }
        if (!(should_squeeze[current])) {
          ++cnt_squeezed_dims;
@@ -104,7 +108,7 @@ class SqueezeOp : public framework::OperatorBase {
               const platform::Place &place) const override {
    auto &axes = Attr<std::vector<int>>("axes");
    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
-    auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims);
+    auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims, true);
    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(out_dims);
@@ -224,7 +228,7 @@ class Squeeze2Op : public framework::OperatorBase {
               const platform::Place &place) const override {
    auto &axes = Attr<std::vector<int>>("axes");
    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
-    auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims);
+    auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims, true);
    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(out_dims);

--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -34,8 +34,11 @@ class TopkOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
    PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape");
+    if (ctx->IsRuntime()) {
      PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
                        "input must have >= k columns");
+    }
    framework::DDim dims = input_dims;
    dims[dims.size() - 1] = k;

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -23,7 +23,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
+#if !defined(__APPLE__) && !defined(_WIN32)
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
 #include "paddle/fluid/platform/gpu_info.h"
 #endif

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -453,6 +453,7 @@ function assert_api_spec_approvals() {
      echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
      if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
          # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
+          # approval_user_list: velconia 1979255,panyx0718 2887803,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. 
          if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
@@ -462,14 +463,14 @@ function assert_api_spec_approvals() {
            fi
          else
            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
          fi
          echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
          if [ "${APPROVALS}" == "FALSE" ]; then
            if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
              echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
            else
-              echo "You must have panyx0718 approval for the api change! ${API_FILE}"
+              echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
            fi
            exit 1
          fi
@@ -479,10 +480,10 @@ function assert_api_spec_approvals() {
    HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
    if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
        if [ "${APPROVALS}" == "FALSE" ]; then
-            echo "You must have panyx0718 approval for the const_cast"
+            echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
            exit 1
        fi
    fi

--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -896,7 +896,7 @@ def save_inference_model(dirname,
                                     True is supported.
    Returns:
-        None
+        target_var_name_list(list): The fetch variables' name list
    Raises:
        ValueError: If `feed_var_names` is not a list of basestring.
@@ -949,11 +949,13 @@ def save_inference_model(dirname,
    # TODO(Superjomn) add an IR pass to remove 1-scale op.
    with program_guard(main_program):
        uniq_target_vars = []
-        for var in target_vars:
+        for i, var in enumerate(target_vars):
            if isinstance(var, Variable):
-                var1 = layers.scale(var, 1.)
+                var = layers.scale(
-            uniq_target_vars.append(var1)
+                    var, 1., name="save_infer_model/scale_{}".format(i))
+            uniq_target_vars.append(var)
        target_vars = uniq_target_vars
+    target_var_name_list = [var.name for var in target_vars]
    # when a pserver and a trainer running on the same machine, mkdir may conflict
    try:
@@ -1010,6 +1012,7 @@ def save_inference_model(dirname,
        params_filename = os.path.basename(params_filename)
    save_persistables(executor, dirname, main_program, params_filename)
+    return target_var_name_list
 def load_inference_model(dirname,

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1348,7 +1348,7 @@ def dropout(x,
                                        1. downgrade_in_infer(default), downgrade the outcome at inference
                                           - train: out = input * mask
-                                           - inference: out = input * dropout_prob
+                                           - inference: out = input * (1.0 - dropout_prob)
                                           (mask is a tensor same shape with input, value is 0 or 1
                                           ratio of 0 is dropout_prob)
@@ -4901,6 +4901,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
        if len(y_shape) > 2 and len(x_shape) > 2:
            for i, dim_x in enumerate(x_shape[:-2]):
+                # don't check neg shape
+                if dim_x < 0 or y_shape[i] < 0:
+                    continue
                if dim_x != y_shape[i]:
                    raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
                                     (x.shape, y.shape))
@@ -9706,7 +9709,12 @@ def sequence_reverse(x, name=None):
    return out
-def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
+def affine_channel(x,
+                   scale=None,
+                   bias=None,
+                   data_layout='NCHW',
+                   name=None,
+                   act=None):
    """
    Applies a separate affine transformation to each channel of the input.
    Useful for replacing spatial batch norm with its equivalent fixed
@@ -9725,6 +9733,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
        data_layout (string, default NCHW): NCHW or NHWC. If input is 2D
            tensor, you can ignore data_layout.
        name (str, default None): The name of this layer.
+        act (str, default None): Activation to be applied to the output of this layer.
    Returns:
        out (Variable): A tensor of the same shape and data layout with x.
@@ -9744,7 +9753,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
                'Bias': bias},
        attrs={"data_layout": data_layout},
        outputs={"Out": out})
-    return out
+    return helper.append_activation(pre_activation)
 def similarity_focus(input, axis, indexes, name=None):

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -70,6 +70,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
@@ -97,6 +98,8 @@ if(WITH_DISTRIBUTE)
        set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
        py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
        set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
+        py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
+        set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000)
        # FIXME(typhoonzero): add these tests back
        # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
        # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -110,7 +110,8 @@ class TestDistRunnerBase(object):
            trainer_prog = fluid.default_main_program()
        if args.use_cuda:
-            place = fluid.CUDAPlace(0)
+            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+            place = fluid.CUDAPlace(device_id)
        else:
            place = fluid.CPUPlace()
@@ -256,6 +257,7 @@ class TestDistBase(unittest.TestCase):
        self._dc_asgd = False  # must use with async mode
        self._use_reader_alloc = True
        self._nccl2_mode = False
+        self._mp_mode = False
        # FIXME(typhoonzero): I added this stupid argument to enable
        # testing allreduce layers, which users can call layers.allreduce
        # to accumulate tensors at anywhere. Find a better way to do this
@@ -504,6 +506,10 @@ class TestDistBase(unittest.TestCase):
            env0 = {'CPU_NUM': '1'}
            env1 = {'CPU_NUM': '1'}
+        if self._mp_mode:
+            env0 = {"FLAGS_selected_gpus": "0"}
+            env1 = {"FLAGS_selected_gpus": "1"}
        env0.update(envs)
        env1.update(envs)

--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+import os
+def skip_ci(func):
+    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
+    def __func__(*args, **kwargs):
+        if on_ci:
+            return
+        return func(*args, **kwargs)
+    return __func__
+class TestDistSeResneXtNCCL(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+    @skip_ci
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_se_resnext.py", delta=1e-5)
+class TestDistSeResneXtNCCLMP(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+        self._mp_mode = True
+    @skip_ci
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "dist_se_resnext.py",
+                delta=1e-5,
+                need_envs={"NCCL_P2P_DISABLE": "1"})
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -13,6 +13,9 @@
 # limitations under the License.
 from __future__ import print_function
+import os
+os.environ['FLAGS_fuse_parameter_memory_size'] = "131072"
+os.environ['FLAGS_fuse_parameter_groups_size'] = "3"
 import paddle.fluid as fluid
 import paddle.fluid.layers.ops as ops
@@ -22,7 +25,6 @@ import paddle.fluid.core as core
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import math
-import os
 import numpy as np
 # FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
@@ -312,17 +314,59 @@ class TestResnet(TestParallelExecutorBase):
        self.assertAlmostEquals(
            np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
+    def _compare_with_fused_all_reduce(self,
+                                       model,
+                                       use_cuda,
+                                       iter=20,
+                                       delta2=1e-5):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        global remove_bn
+        remove_bn = True
+        img, label = self._init_data(batch_size=batch_size)
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            fuse_all_reduce_ops=False,
+            optimizer=optimizer)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            fuse_all_reduce_ops=True,
+            optimizer=optimizer)
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
    def test_seresnext_with_learning_rate_decay(self):
        self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
        self._check_resnet_convergence(
            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
-    def test_seresnext_with_new_strategy(self):
+    def test_seresnext_with_reduce(self):
        self._compare_reduce_and_allreduce(
            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
        self._compare_reduce_and_allreduce(
            model=SE_ResNeXt50Small, use_cuda=False, iter=5)
+    def test_seresnext_with_fused_all_reduce(self):
+        self._compare_with_fused_all_reduce(
+            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3)
+        self._compare_with_fused_all_reduce(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
 if __name__ == '__main__':
    unittest.main()