Commit 706f3839

Authored Jun 19, 2018 by yuyang18

    Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-none-layers-api-doc

Parents: fe5de04b, 9988f8ec
Showing 21 changed files with 855 additions and 236 deletions (+855 −236)
cmake/external/mkldnn.cmake                                +2    -1
paddle/fluid/framework/executor.cc                         +3    -2
paddle/fluid/operators/activation_mkldnn_op.cc             +196  -120
paddle/fluid/operators/activation_op.cc                    +16   -13
paddle/fluid/operators/concat_op.h                         +26   -15
paddle/fluid/operators/detection_map_op.cc                 +6    -6
paddle/fluid/operators/math/concat.cc                      +15   -10
paddle/fluid/operators/math/concat.cu                      +25   -16
paddle/fluid/operators/math/concat.h                       +2    -1
paddle/fluid/platform/cpu_info.cc                          +15   -9
paddle/testing/paddle_gtest_main.cc                        +3    -2
python/paddle/fluid/__init__.py                            +1    -1
python/paddle/fluid/framework.py                           +7    -1
python/paddle/fluid/layers/detection.py                    +43   -2
python/paddle/fluid/layers/nn.py                           +99   -0
python/paddle/fluid/optimizer.py                           +115  -7
python/paddle/fluid/profiler.py                            +112  -5
python/paddle/fluid/tests/unittests/test_layers.py         +9    -0
python/paddle/fluid/tests/unittests/test_optimizer.py      +66   -0
python/paddle/fluid/transpiler/distribute_transpiler.py    +53   -5
python/paddle/fluid/transpiler/inference_transpiler.py     +41   -20
cmake/external/mkldnn.cmake

@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
   MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result")
+SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
+SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
 SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
 ExternalProject_Add(
paddle/fluid/framework/executor.cc

@@ -295,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
     const ProgramDesc& program, int block_id) {
-  auto* ctx = new ExecutorPrepareContext(program, block_id);
+  std::unique_ptr<ExecutorPrepareContext> ctx(
+      new ExecutorPrepareContext(program, block_id));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
-  return std::unique_ptr<ExecutorPrepareContext>(ctx);
+  return ctx;
 }

 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
paddle/fluid/operators/activation_mkldnn_op.cc

@@ -12,16 +12,20 @@
    See the License for the specific language governing permissions and
    limitations under the License. */

 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"

 namespace paddle {
 namespace operators {

-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using platform::GetMKLDNNFormat;
+using platform::MKLDNNDeviceContext;
+using platform::to_void_cast;

 namespace {
 std::string gethash(const mkldnn::memory::dims &operand_dims,

@@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims,
   };
   return dim2str(operand_dims) + std::to_string(algorithm);
 }
 }  // namespace

+template <typename Functor>
+class MKLDNNActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};

-template <typename T, typename ExecContext>
-void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
-                     const T alpha = 0, const T beta = 0) {
+template <typename Functor>
+class MKLDNNActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input OutGrad tensor");
+
+    Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};
+
+template <typename T>
+void eltwise_forward(const framework::ExecutionContext &ctx,
+                     mkldnn::algorithm algorithm, const T alpha = 0,
+                     const T beta = 0) {
   PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                  "It must use CPUPlace.");
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
   const auto &mkldnn_engine = dev_ctx.GetEngine();

-  // get buffers
-  const auto *src = ctx.template Input<Tensor>("X");
-  const auto *src_data = src->template data<T>();
+  const auto *x = ctx.Input<Tensor>("X");
+  auto *y = ctx.Output<Tensor>("Out");

-  auto *dst = ctx.template Output<Tensor>("Out");
-  T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
+  const T *x_data = x->data<T>();
+  T *y_data = y->mutable_data<T>(ctx.GetPlace());

-  // get memory dim
-  PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
+  PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
                  "Input dim must be with 2 or 4");
-  std::vector<int> src_tz = framework::vectorize2int(src->dims());
+
+  std::vector<int> src_tz = framework::vectorize2int(x->dims());
+
+  auto src_format =
+      src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();

   const std::string key = gethash(src_tz, algorithm);
   const std::string key_src_data =
       key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
-  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
-  const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem";
-  const std::string key_fwd = key + "@eltwise_fwd";
+  const std::string key_src_layout =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout";
+  const std::string key_with_layout = key + std::to_string(src_format);
+  const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem";
+  const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem";
+  const std::string key_fwd = key_with_layout + "@eltwise_fwd";
+  const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
+
+  // save input data and layout to be referred in backward path
+  auto p_src_data = std::make_shared<const T *>(x_data);
+  dev_ctx.SetBlob(key_src_data, p_src_data);
+  auto p_src_layout = std::make_shared<memory::format>(src_format);
+  dev_ctx.SetBlob(key_src_layout, p_src_layout);

   auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
       dev_ctx.GetBlob(key_fwd));

-  // save input data to be referred in backward path
-  auto p_src_data = std::make_shared<const T *>(src_data);
-  dev_ctx.SetBlob(key_src_data, p_src_data);
+  std::shared_ptr<memory> dst_memory;

   if (p_fwd == nullptr) {
-    // create memory description
-    auto data_md = src_tz.size() == 2
-                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nc)
-                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nchw);
-    // create memory primitives
-    auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
-        {data_md, mkldnn_engine}, platform::to_void_cast(src_data)));
-    dev_ctx.SetBlob(key_src_mem, p_src_mem);
-
-    auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
-        {data_md, mkldnn_engine}, platform::to_void_cast(dst_data)));
-    dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
-
-    auto fwd_desc = mkldnn::eltwise_forward::desc(
-        mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
-    auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
-        fwd_desc, mkldnn_engine);
-    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
-    dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
-    p_fwd = std::make_shared<mkldnn::eltwise_forward>(
-        *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
+    // create mkldnn memory for input X
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), src_format);
+    auto src_memory = std::shared_ptr<memory>(
+        new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
+    // save src_memory to be referred in backward path
+    dev_ctx.SetBlob(key_src_mem, src_memory);
+
+    // create primitive descriptor for activation forward and save it
+    auto forward_desc = mkldnn::eltwise_forward::desc(
+        mkldnn::prop_kind::forward_training, algorithm,
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto forward_pd =
+        std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+            forward_desc, mkldnn_engine);
+
+    // save prim desc into global device context to be referred in backward path
+    dev_ctx.SetBlob(key_fwd_pd, forward_pd);
+
+    // create mkldnn memory for output y
+    dst_memory =
+        std::make_shared<memory>(forward_pd->dst_primitive_desc(), y_data);
+    dev_ctx.SetBlob(key_dst_mem, dst_memory);
+
+    // create activation primitive
+    p_fwd = std::make_shared<mkldnn::eltwise_forward>(*forward_pd, *src_memory,
+                                                      *dst_memory);
     dev_ctx.SetBlob(key_fwd, p_fwd);
   } else {
     // primitives already exist
-    auto p_src_mem =
+    auto src_memory =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
-    PADDLE_ENFORCE(p_src_mem != nullptr,
-                   "Fail to find eltwise p_src_mem in device context.");
-    auto p_dst_mem =
+    PADDLE_ENFORCE(src_memory != nullptr,
+                   "Fail to find eltwise src_memory in device context.");
+    dst_memory =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
-    PADDLE_ENFORCE(p_dst_mem != nullptr,
-                   "Fail to find eltwise p_src_mem in device context.");
+    PADDLE_ENFORCE(dst_memory != nullptr,
+                   "Fail to find eltwise dst_memory in device context.");

-    p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data));
-    p_dst_mem->set_data_handle(dst_data);
+    src_memory->set_data_handle(platform::to_void_cast(x_data));
+    dst_memory->set_data_handle(y_data);
   }

   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())};
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_fwd);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  y->set_layout(DataLayout::kMKLDNN);
+  y->set_format(GetMKLDNNFormat(*dst_memory));
 }

-template <typename T, typename ExecContext>
-void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
-                  const T alpha = 0, const T beta = 0) {
+template <typename T>
+void eltwise_grad(const framework::ExecutionContext &ctx,
+                  mkldnn::algorithm algorithm, const T alpha = 0,
+                  const T beta = 0) {
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
   const auto &mkldnn_engine = dev_ctx.GetEngine();

-  // get buffers
-  const auto *out = ctx.template Input<Tensor>("Out");
-  auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
-  const auto *diff_dst = dout->template data<T>();
+  const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+  auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));

-  auto *dx =
-      ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
-  const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
+  const T *diff_y_data = diff_y->data<T>();
+  T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());

-  // get memory dim
-  std::vector<int> src_tz = framework::vectorize2int(out->dims());
+  std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());

-  const std::string key = gethash(src_tz, algorithm);
-  const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem";
-  const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem";
-  const std::string key_grad = key + "@eltwise_grad";
+  auto diff_y_format =
+      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
+
+  const std::string key = gethash(diff_dst_tz, algorithm);
+  const std::string key_src_data =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_layout =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout";
+  const auto p_src_layout = std::static_pointer_cast<memory::format>(
+      dev_ctx.GetBlob(key_src_layout));
+  const std::string key_src_mem =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
+  const std::string key_fwd_pd =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
+  const std::string key_with_layouts =
+      key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
+  const std::string key_diff_src_mem =
+      key_with_layouts + "@eltwise_diff_src_mem";
+  const std::string key_diff_dst_mem =
+      key_with_layouts + "@eltwise_diff_dst_mem";
+  const std::string key_grad = key_with_layouts + "@eltwise_grad";

   const auto p_src_data =
       std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));

-  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
-  auto p_src_mem =
+  auto src_memory =
       std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
-  p_src_mem->set_data_handle(*p_src_data.get());
+  PADDLE_ENFORCE(src_memory != nullptr,
+                 "Fail to find src_memory in device context");
+  src_memory->set_data_handle(*p_src_data.get());
+
+  std::shared_ptr<memory> diff_src_memory;

-  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_forward::primitive>(
+  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_backward>(
       dev_ctx.GetBlob(key_grad));

   if (p_grad == nullptr) {
-    // create memory description
-    auto data_md = src_tz.size() == 2
-                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nc)
-                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nchw);
-    // create memory primitives
-    std::shared_ptr<void> p_diff_src_mem =
-        std::make_shared<mkldnn::memory>(mkldnn::memory(
-            {data_md, mkldnn_engine}, platform::to_void_cast(diff_src)));
-    dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem);
-    std::shared_ptr<void> p_diff_dst_mem =
-        std::make_shared<mkldnn::memory>(mkldnn::memory(
-            {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst)));
-    dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem);
-
-    auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md,
-                                                   alpha, beta);
-
-    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
-    auto *p_fwd_pd = static_cast<mkldnn::eltwise_forward::primitive_desc *>(
-        dev_ctx.GetBlob(key_fwd_pd).get());
-
-    auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
-        bwd_desc, mkldnn_engine, *p_fwd_pd);
-
+    // create mkldnn memory for input diff_y
+    auto diff_dst_md = platform::MKLDNNMemDesc(
+        diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
+    auto diff_dst_memory = std::shared_ptr<memory>(
+        new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
+    dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
+
+    // retrieve eltwise primitive desc from device context
+    auto forward_pd =
+        std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_fwd_pd));
+    PADDLE_ENFORCE(forward_pd != nullptr,
+                   "Fail to find eltwise_fwd_pd in device context");
+
+    // ceate primitive descriptor for activation backward
+    auto backward_desc = mkldnn::eltwise_backward::desc(
+        algorithm, diff_dst_memory->get_primitive_desc().desc(),
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto backward_pd = mkldnn::eltwise_backward::primitive_desc(
+        backward_desc, mkldnn_engine, *forward_pd);
+
+    // create mkldnn memory for output diff_src
+    diff_src_memory = std::make_shared<memory>(
+        backward_pd.diff_src_primitive_desc(), diff_x_data);
+    dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory);
+
+    // create activation backward primitive
     p_grad = std::make_shared<mkldnn::eltwise_backward>(
-        eltwise_bwd_prim_desc, *static_cast<mkldnn::memory *>(p_src_mem.get()),
-        *(static_cast<mkldnn::memory *>(p_diff_dst_mem.get())),
-        *(static_cast<mkldnn::memory *>(p_diff_src_mem.get())));
+        backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory);
     dev_ctx.SetBlob(key_grad, p_grad);
   } else {
     // primitives already exist
-    auto p_diff_src_mem = std::static_pointer_cast<mkldnn::memory>(
+    diff_src_memory = std::static_pointer_cast<mkldnn::memory>(
         dev_ctx.GetBlob(key_diff_src_mem));
-    auto p_diff_dst_mem = std::static_pointer_cast<mkldnn::memory>(
+    auto diff_dst_memory = std::static_pointer_cast<mkldnn::memory>(
         dev_ctx.GetBlob(key_diff_dst_mem));
-    p_diff_src_mem->set_data_handle(
-        platform::to_void_reinterpret_cast(diff_src));
-    p_diff_dst_mem->set_data_handle(
-        platform::to_void_reinterpret_cast(diff_dst));
+    diff_src_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_x_data));
+    diff_dst_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_y_data));
   }

   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {*(p_grad.get())};
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_grad);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  diff_x->set_layout(DataLayout::kMKLDNN);
+  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
 }
 }  // anonymous namespace

 template <typename T, mkldnn::algorithm algorithm>
 struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
-  template <typename ExecContext>
-  void operator()(const ExecContext &ctx) const {
+  void operator()(const framework::ExecutionContext &ctx) const {
     eltwise_forward<T>(ctx, algorithm);
   }
 };

 template <typename T, mkldnn::algorithm algorithm>
 struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
-  template <typename ExecContext>
-  void operator()(const ExecContext &ctx) const {
+  void operator()(const framework::ExecutionContext &ctx) const {
     eltwise_grad<T>(ctx, algorithm);
   }
 };
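The essential change in this file is that the cached MKLDNN blobs are now keyed by input memory layout as well as by shape and algorithm (the new key_with_layout / key_with_layouts strings), so a cached primitive is never reused with data in a different format. As a rough illustration of that keying idea only, here is a toy Python sketch; the dict, function, and string format below are illustrative stand-ins, not the device-context API:

    # Toy cache keyed roughly the way the new code builds its blob keys:
    # gethash(dims, algorithm) plus a layout suffix.
    blob_cache = {}

    def get_or_create(dims, algorithm, layout, build):
        key = "{}-{}@{}".format("x".join(map(str, dims)), algorithm, layout)
        if key not in blob_cache:
            blob_cache[key] = build()
        return blob_cache[key]

    # Two calls with the same shape but different layouts no longer collide.
    get_or_create([8, 16, 32, 32], "relu", "nchw", lambda: "primitive#1")
    get_or_create([8, 16, 32, 32], "relu", "nChw8c", lambda: "primitive#2")
    print(len(blob_cache))  # 2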
paddle/fluid/operators/activation_op.cc

@@ -19,18 +19,20 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+using paddle::framework::Tensor;
+
 #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                   \
   class OP_NAME##OpMaker                                                    \
       : public ::paddle::framework::OpProtoAndCheckerMaker {                \
    public:                                                                  \
     void Make() override {                                                  \
       AddInput("X", "Input of " #OP_NAME " operator");                      \
       AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X");       \
       AddAttr<bool>("use_mkldnn",                                           \
-                    "(default false) Only used in mkldnn kernel")           \
+                    "(bool, default false) Only used in mkldnn kernel")     \
           .SetDefault(false);                                               \
-      AddComment(OP_COMMENT);                                               \
+      AddComment(#OP_COMMENT);                                              \
     }                                                                       \
   }

 #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)             \

@@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                       const framework::OperatorWithKernel& oper,
                                       const std::string& name) {
   framework::LibraryType library{framework::LibraryType::kPlain};
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
   auto it = oper.Attrs().find("use_mkldnn");

@@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel {
     ctx->ShareLoD("X", /*->*/ "Out");
   }

+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return GetKernelType(ctx, *this, "X");

@@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
   }

+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return GetKernelType(ctx, *this, "Out");
paddle/fluid/operators/concat_op.h

@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T>
 class ConcatGradKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
     auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));

+    // get output tensor that the name is not kEmptyVarName
+    std::vector<framework::Tensor*> outputs;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      if (out_var_names[j] != framework::kEmptyVarName) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        outputs.push_back(outs[j]);
+      } else {
+        outputs.push_back(nullptr);
+      }
+    }
+
     // Sometimes direct copies will be faster, this maybe need deeply analysis.
     if (axis == 0 && outs.size() < 10) {
       size_t input_offset = 0;
-      auto in_stride = framework::stride_numel(in->dims());
+      const auto in_stride = framework::stride_numel(out_grad->dims());

-      for (auto& out : outs) {
-        out->mutable_data<T>(ctx.GetPlace());
-        auto out_stride = framework::stride_numel(out->dims());
-        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-                                    out_stride, in->data<T>() + input_offset,
-                                    in_stride, out_stride[axis]);
+      for (size_t i = 0; i < outs.size(); ++i) {
+        auto out_stride = framework::stride_numel(ins[i]->dims());
+        auto* out = outputs[i];
+        if (out != nullptr) {
+          StridedNumelCopyWithAxis<T>(
+              ctx.device_context(), axis, out->data<T>(), out_stride,
+              out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
+        }
         input_offset += out_stride[axis];
       }
     } else {
-      std::vector<framework::Tensor> outputs(outs.size());
-      for (size_t j = 0; j < outs.size(); ++j) {
-        outs[j]->mutable_data<T>(ctx.GetPlace());
-        outputs[j] = *outs[j];
-      }
-
       auto& dev_ctx = ctx.template device_context<DeviceContext>();
       paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
           concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
+      concat_grad_functor(dev_ctx, *out_grad, ins, static_cast<int>(axis),
+                          &outputs);
     }
   }
 };
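The gist of the new ConcatGradKernel (and of the ConcatGradFunctor signature change below) is that the gradient of concat is just the output gradient split back along `axis` into pieces shaped like the original inputs, and pieces for inputs whose gradient variable is empty are simply skipped. A rough NumPy sketch of that behaviour (function and argument names here are illustrative, not the operator's API):

    import numpy as np

    def concat_grad(out_grad, ins, axis, need_grad):
        """Split out_grad along `axis` into pieces shaped like `ins`;
        return None for inputs whose gradient is not requested."""
        sizes = [x.shape[axis] for x in ins]
        pieces = np.split(out_grad, np.cumsum(sizes)[:-1], axis=axis)
        return [p if need else None for p, need in zip(pieces, need_grad)]

    a, b = np.ones((2, 3)), np.ones((2, 2))
    out_grad = np.arange(10.0).reshape(2, 5)
    grads = concat_grad(out_grad, [a, b], axis=1, need_grad=[True, False])
    print(grads[0].shape, grads[1])  # (2, 3) None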
paddle/fluid/operators/detection_map_op.cc

@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Detection mAP evaluate operator.
 The general steps are as follows. First, calculate the true positive and
-false positive according to the input of detection and labels, then
-calculate the mAP evaluate value.
-Supporting '11 point' and 'integral' mAP algorithm. Please get more information
-from the following articles:
-https://sanchom.wordpress.com/tag/average-precision/
-https://arxiv.org/abs/1512.02325
+false positive according to the input of detection and labels, then
+calculate the mAP evaluate value.
+Supporting '11 point' and 'integral' mAP algorithm. Please get more information
+from the following articles:
+https://sanchom.wordpress.com/tag/average-precision/
+https://arxiv.org/abs/1512.02325

 )DOC");
   }
paddle/fluid/operators/math/concat.cc

@@ -70,35 +70,40 @@ template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs->size();
+    size_t num = outputs->size();
     int input_rows = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       input_rows *= dim_0[i];
     }
+
     int input_cols = 0;

     std::vector<int64_t> output_cols(outputs->size());
-    for (int i = 0; i < num; ++i) {
-      int t_cols = outputs->at(i).numel() / input_rows;
+    for (size_t i = 0; i < num; ++i) {
+      int t_cols = ref_inputs[i]->numel() / input_rows;
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
     auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());

     // computation
-    for (int k = 0; k < input_rows; ++k) {
+    for (size_t k = 0; k < input_rows; ++k) {
       const T* src_ptr = input.data<T>() + k * input_cols;
       int col_idx = 0;
       for (int j = 0; j < num; ++j) {
         int col_len = output_cols[j];
-        T* dst_ptr = outputs->at(j).data<T>() + k * col_len;
-        memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
-                     sizeof(T) * col_len);
+        auto* out_tensor = outputs->at(j);
+        if (out_tensor != nullptr) {
+          T* dst_ptr = out_tensor->data<T>() + k * col_len;
+          memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
+                       sizeof(T) * col_len);
+        }
         col_idx += col_len;
       }
     }
paddle/fluid/operators/math/concat.cu

@@ -102,10 +102,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
     T* output_ptr = outputs_data[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * segment_width + local_col] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }

@@ -118,10 +120,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
     int split = tid_x / fixed_out_col;
     int in_offset = tid_x - split * fixed_out_col;
     T* output_ptr = outputs_data[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * fixed_out_col + in_offset] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }

@@ -203,17 +207,18 @@ template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     int o_num = outputs->size();
     int out_row = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       out_row *= dim_0[i];
     }

-    int out_col = outputs->at(0).numel() / out_row;
+    int out0_col = ref_inputs[0]->numel() / out_row;
     int in_col = 0, in_row = out_row;
     bool sameShape = true;

@@ -223,13 +228,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs->at(i).numel() / out_row;
+      int t_col = outputs->at(i)->numel() / out_row;
       if (sameShape) {
-        if (t_col != out_col) sameShape = false;
+        if (t_col != out0_col) sameShape = false;
       }
       in_col += t_col;
       outputs_cols[i + 1] = in_col;
-      outputs_ptr[i] = outputs->at(i).data<T>();
+      if (outputs->at(i) != nullptr) {
+        outputs_ptr[i] = outputs->at(i)->data<T>();
+      } else {
+        outputs_ptr[i] = nullptr;
+      }
     }

     T** dev_out_gpu_data =

@@ -255,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     if (sameShape) {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
+          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
       const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
paddle/fluid/operators/math/concat.h

@@ -57,7 +57,8 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const int axis, std::vector<framework::Tensor>* outputs);
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs);
 };

 }  // namespace math
paddle/fluid/platform/cpu_info.cc

@@ -28,9 +28,15 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
               "Default use 100% of CPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");

-DEFINE_uint64(initial_cpu_memory_in_mb, 500,
-              "Default initial 500MB of CPU memory for PaddlePaddle, in MD unit.");
+DEFINE_uint64(initial_cpu_memory_in_mb,
+#ifdef PADDLE_WITH_MKLDNN
+              /* Aligned with mozga-intel, MKLDNN need at least 5000 MB
+               * to obtain the best performance*/
+              5000,
+#else
+              500,
+#endif
+              "Initial CPU memory for PaddlePaddle, in MD unit.");

 DEFINE_double(fraction_of_cuda_pinned_memory_to_use, 0.5,

@@ -59,10 +65,7 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return std::min(
-      static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
-                          CpuTotalPhysicalMemory()),
-      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }

 size_t CpuMinChunkSize() {

@@ -71,8 +74,11 @@ size_t CpuMinChunkSize() {
 }

 size_t CpuMaxChunkSize() {
-  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
-  return CpuMaxAllocSize() / 32;
+  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
+  // or the initial_cpu_memory_in_mb.
+  return std::min(
+      static_cast<size_t>(CpuMaxAllocSize() / 32),
+      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
 }

 size_t CUDAPinnedMaxAllocSize() {
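A quick worked example of the new CpuMaxChunkSize() bound, as a small Python sketch; the 8 GB physical-memory figure and the flag values below are assumed for illustration, not taken from the diff:

    # Mirror of the arithmetic in cpu_info.cc above, with assumed inputs.
    fraction_of_cpu_memory_to_use = 1.0
    initial_cpu_memory_in_mb = 500          # 5000 when built with MKLDNN
    total_physical = 8 * 1024 ** 3          # assume an 8 GB machine

    cpu_max_alloc = int(fraction_of_cpu_memory_to_use * total_physical)
    cpu_max_chunk = min(cpu_max_alloc // 32,             # ~3% of usable memory
                        initial_cpu_memory_in_mb << 20)  # flag value in bytes
    print(cpu_max_chunk)  # 268435456 -> min(256 MB, 500 MB) = 256 MB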
paddle/testing/paddle_gtest_main.cc

@@ -30,8 +30,9 @@ int main(int argc, char** argv) {
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
-  new_argv.push_back(strdup("--undefok=use_mkldnn"));
+  new_argv.push_back(strdup(
+      "--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb"));
+  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
python/paddle/fluid/__init__.py

@@ -117,7 +117,7 @@ def __bootstrap__():
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn'
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += [
python/paddle/fluid/framework.py

@@ -644,7 +644,13 @@ class Operator(object):
     def set_attr(self, name, val):
         self.attrs[name] = val
-        self.desc.set_attr(name, val)
+        if isinstance(val, Block):
+            self.desc.set_block_attr(name, val.desc)
+        elif isinstance(val, core.BlockDesc) or \
+                isinstance(val, core.ProgramDesc):
+            self.desc.set_serialized_attr(name, val.serialize_to_string())
+        else:
+            self.desc.set_attr(name, val)

     @property
     def attr_names(self):
python/paddle/fluid/layers/detection.py

@@ -16,7 +16,7 @@ All layers just related to the detection neural network.
 """

 from layer_function_generator import generate_layer_fn
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
 import tensor
 import nn

@@ -155,7 +155,7 @@ def detection_output(loc,
     return nmsed_outs


-@autodoc()
+@templatedoc()
 def detection_map(detect_res,
                   label,
                   class_num,

@@ -166,6 +166,47 @@ def detection_map(detect_res,
                   input_states=None,
                   out_states=None,
                   ap_version='integral'):
+    """
+    ${comment}
+
+    Args:
+        detect_res: ${detect_res_comment}
+        label: ${label_comment}
+        class_num: ${class_num_comment}
+        background_label: ${background_label_comment}
+        overlap_threshold: ${overlap_threshold_comment}
+        evaluate_difficult: ${evaluate_difficult_comment}
+        has_state: ${has_state_comment}
+        input_states: If not None, It contains 3 elements:
+            1. pos_count ${pos_count_comment}.
+            2. true_pos ${true_pos_comment}.
+            3. false_pos ${false_pos_comment}.
+        out_states: If not None, it contains 3 elements.
+            1. accum_pos_count ${accum_pos_count_comment}.
+            2. accum_true_pos ${accum_true_pos_comment}.
+            3. accum_false_pos ${accum_false_pos_comment}.
+        ap_version: ${ap_type_comment}
+
+    Returns:
+        ${map_comment}
+
+    Examples:
+        .. code-block:: python
+
+            detect_res = fluid.layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = fluid.layers.data(
+                name='label',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+
+            map_out = fluid.layers.detection_map(detect_res, label, 21)
+    """
     helper = LayerHelper("detection_map", **locals())

     def __create_var(type):
python/paddle/fluid/layers/nn.py

@@ -93,6 +93,7 @@ __all__ = [
     'mean_iou',
     'relu',
     'log',
+    'crop',
 ]

@@ -5003,3 +5004,101 @@ def mean_iou(input, label, num_classes):
         },
         attrs={"num_classes": num_classes})
     return out_mean_iou, out_wrong, out_correct
+
+
+def crop(x, shape=None, offsets=None, name=None):
+    """
+    Crop input into output, as specified by offsets and shape.
+
+    .. code-block:: text
+
+        * Case 1:
+            Given
+                X = [[0, 1, 2, 0, 0]
+                     [0, 3, 4, 0, 0]
+                     [0, 0, 0, 0, 0]],
+            and
+                shape = [2, 2],
+                offsets = [0, 1],
+            output is:
+                Out = [[1, 2],
+                       [3, 4]].
+        * Case 2:
+            Given
+                X = [[0, 1, 2, 5, 0]
+                     [0, 3, 4, 6, 0]
+                     [0, 0, 0, 0, 0]],
+            and shape is tensor
+                shape = [[0, 0, 0]
+                         [0, 0, 0]]
+            and
+                offsets = [0, 1],
+            output is:
+                Out = [[1, 2, 5],
+                       [3, 4, 6]].
+
+    Args:
+        x (Variable): The input tensor variable.
+        shape (Variable|list/tuple of integer): The output shape is specified
+            by `shape`, which can a Variable or a list/tupe of integer.
+            If a tensor Variable, it's rank must be the same as `x`. This way
+            is suitable for the case that the output shape may be changed each
+            iteration. If a list/tupe of integer, it's length must be the same
+            as the rank of `x`
+        offsets (Variable|list/tuple of integer|None): Specifies the copping
+            offsets at each dimension. It can be a Variable or or a list/tupe
+            of integer. If a tensor Variable, it's rank must be the same as `x`.
+            This way is suitable for the case that the offsets may be changed
+            each iteration. If a list/tupe of integer, it's length must be the
+            same as the rank of `x`. If None, the offsets are 0 at each
+            dimension.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The cropped tensor variable.
+
+    Raises:
+        ValueError: If shape is not a list, tuple or Variable.
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
+            y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32")
+            crop = fluid.layers.crop(x, shape=y)
+
+            # or
+            z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
+            crop = fluid.layers.crop(z, shape=[2, 3])
+
+    """
+    helper = LayerHelper('crop', **locals())
+
+    if not (isinstance(shape, list) or isinstance(shape, tuple) or \
+            isinstance(shape, Variable)):
+        raise ValueError("The shape should be a list, tuple or Variable.")
+
+    if offsets is None:
+        offsets = [0] * len(x.shape)
+
+    out = helper.create_tmp_variable(x.dtype)
+    ipts = {'X': x}
+    attrs = {}
+    if isinstance(shape, Variable):
+        ipts['Y'] = shape
+    else:
+        attrs['shape'] = shape
+    if isinstance(offsets, Variable):
+        ipts['Offsets'] = offsets
+    else:
+        attrs['offsets'] = offsets
+
+    helper.append_op(
+        type='crop',
+        inputs=ipts,
+        outputs={'Out': out},
+        attrs=None if len(attrs) == 0 else attrs)
+    return out
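To make the crop semantics in the new docstring concrete, here is a minimal NumPy sketch of Case 1; this is an illustration of the documented behaviour only, the actual crop kernel is implemented in C++:

    import numpy as np

    x = np.array([[0, 1, 2, 0, 0],
                  [0, 3, 4, 0, 0],
                  [0, 0, 0, 0, 0]])
    shape, offsets = [2, 2], [0, 1]

    # Take a shape-sized window starting at the given offsets.
    out = x[offsets[0]:offsets[0] + shape[0],
            offsets[1]:offsets[1] + shape[1]]
    print(out)  # [[1 2]
                #  [3 4]]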
python/paddle/fluid/optimizer.py

@@ -26,10 +26,10 @@ from clip import append_gradient_clip_ops, error_clip_callback
 from contextlib import contextmanager

 __all__ = [
-    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
-    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'AdadeltaOptimizer',
-    'RMSPropOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer'
+    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
 ]

@@ -794,7 +794,7 @@ class AdadeltaOptimizer(Optimizer):
         E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2

     Args:
-        learning_rate(float): global leraning rate
+        learning_rate(float): global learning rate
         rho(float): rho in equation
         epsilon(float): epsilon in equation

@@ -895,9 +895,9 @@ class RMSPropOptimizer(Optimizer):
     Args:
-        learning_rate(float): global leraning rate.
-        rho(float): rho is :math:`\\rho` in equation, set 0.95 by default.
-        epsilon(float): :math:`\\epsilon` in equation is smoothing term to
+        learning_rate(float): global learning rate.
+        rho(float): rho is :math:`\\rho` in equation, set 0.95 by default.
+        epsilon(float): :math:`\\epsilon` in equation is smoothing term to
             avoid division by zero, set 1e-6 by default.
         momentum(float): :math:`\\beta` in equation is the momentum term,
             set 0.0 by default.

@@ -976,6 +976,113 @@ class RMSPropOptimizer(Optimizer):
         return rmsprop_op


+class FtrlOptimizer(Optimizer):
+    """
+    FTRL (Follow The Regularized Leader) Optimizer.
+
+    The paper that proposed Follow The Regularized Leader (FTRL):
+    (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+
+    .. math::
+
+        &new\_accum = squared\_accum + grad^2
+
+        &if (lr\_power == -0.5):
+
+        &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}
+
+        &else:
+
+        &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}
+
+        &x = l1 * sign(linear\_accum) - linear\_accum
+
+        &if (lr\_power == -0.5):
+
+        &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)
+
+        &\quad pre\_shrink = \\frac{x}{y}
+
+        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
+
+        &else:
+
+        &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)
+
+        &\quad pre\_shrink = \\frac{x}{y}
+
+        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
+
+        &squared\_accum += grad^2
+
+    Args:
+        learning_rate (float|Variable): global learning rate.
+        l1 (float):
+        l2 (float):
+        lr_power (float):
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon, momentum are None.
+
+    Examples:
+        .. code-block:: python
+
+              optimizer = fluid.optimizer.Ftrl(0.0001)
+              _, params_grads = optimizer.minimize(cost)
+    """
+
+    _squared_acc_str = "squared"
+    _linear_acc_str = "linear"
+
+    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
+        super(FtrlOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+
+        self.type = "ftrl"
+        self._l1 = l1
+        self._l2 = l2
+        self._lr_power = lr_power
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._squared_acc_str, p)
+            self._add_accumulator(self._linear_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        squared_acc = self._get_accumulator(self._squared_acc_str,
+                                            param_and_grad[0])
+        linear_acc = self._get_accumulator(self._linear_acc_str,
+                                           param_and_grad[0])
+        ftrl_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "SquaredAccumulator": squared_acc,
+                "LinearAccumulator": linear_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "SquaredAccumOut": squared_acc,
+                "LinearAccumOut": linear_acc
+            },
+            attrs={"l1": self._l1,
+                   "l2": self._l1,
+                   "lr_power": self._lr_power})
+
+        return ftrl_op
+
+
 # We short the class name, since users will use the optimizer with the package
 # name. The sample code:
 #

@@ -992,6 +1099,7 @@ Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
 Adadelta = AdadeltaOptimizer
 RMSProp = RMSPropOptimizer
+Ftrl = FtrlOptimizer


 class ModelAverage(Optimizer):
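As a sanity check of the FTRL equations quoted in the new docstring, here is a direct NumPy transcription of a single update step. It is a sketch of the documented math only, not of the C++ ftrl kernel, and it assumes a parameter array with no zero entries so the division in the linear-accumulator term is well defined:

    import numpy as np

    def ftrl_update(param, grad, squared_accum, linear_accum,
                    learning_rate, l1, l2, lr_power=-0.5):
        # new_accum = squared_accum + grad^2
        new_accum = squared_accum + grad ** 2
        if lr_power == -0.5:
            linear_accum = linear_accum + grad - (
                np.sqrt(new_accum) - np.sqrt(squared_accum)) / (
                    learning_rate * param)
            y = np.sqrt(new_accum) / learning_rate + 2.0 * l2
        else:
            linear_accum = linear_accum + grad - (
                new_accum ** (-lr_power) - squared_accum ** (-lr_power)) / (
                    learning_rate * param)
            y = new_accum ** (-lr_power) / learning_rate + 2.0 * l2
        x = l1 * np.sign(linear_accum) - linear_accum
        pre_shrink = x / y
        # the parameter is shrunk to 0 unless |linear_accum| exceeds l1
        param = np.where(np.abs(linear_accum) > l1, pre_shrink, 0.0)
        squared_accum = squared_accum + grad ** 2
        return param, squared_accum, linear_accum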
python/paddle/fluid/profiler.py

@@ -42,6 +42,9 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     counters/options for profiling by `config` argument. The default config
     is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
     'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
+    Then users can use NVIDIA Visual Profiler
+    (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
+    this output file to visualize results.

     Args:
         output_file (string) : The output file name, the result will be

@@ -50,6 +53,33 @@ def cuda_profiler(output_file, output_mode=None, config=None):
             Comma separated values format. It should be 'kvp' or 'csv'.
         config (list of string) : The profiler options and counters can refer
             to "Compute Command Line Profiler User Guide".
+
+    Raises:
+        ValueError: If `output_mode` is not in ['kvp', 'csv'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle.fluid.profiler as profiler
+
+            epoc = 8
+            dshape = [4, 3, 28, 28]
+            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            output_file = 'cuda_profiler.txt'
+            with profiler.cuda_profiler(output_file, 'csv') as nvprof:
+                for i in range(epoc):
+                    input = np.random.random(dshape).astype('float32')
+                    exe.run(fluid.default_main_program(), feed={'data': input})
+            # then use NVIDIA Visual Profiler (nvvp) to load this output file
+            # to visualize results.
     """
     if output_mode is None:
         output_mode = 'csv'

@@ -69,19 +99,52 @@ def cuda_profiler(output_file, output_mode=None, config=None):
 def reset_profiler():
-    """The profiler clear interface.
-    reset_profiler will clear the previous time record.
+    """
+    Clear the previous time record. This interface does not work for
+    `fluid.profiler.cuda_profiler`, it only works for
+    `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`,
+    and `fluid.profiler.profiler`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+            with profiler.profiler(state, 'total', '/tmp/profile'):
+                for iter in range(10):
+                    if iter == 2:
+                        profiler.reset_profiler()
+                    # ...
     """
     core.reset_profiler()


 def start_profiler(state):
-    """Enable the profiler.
+    """
+    Enable the profiler. Uers can use `fluid.profiler.start_profiler` and
+    `fluid.profiler.stop_profiler` to insert the code, except the usage of
+    `fluid.profiler.profiler` interface.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
+            GPU as well. 'All' also generates timeline.
+
+    Raises:
+        ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            profiler.start_profiler('GPU')
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                # except each iteration
+            profiler.stop_profiler('total', '/tmp/profile')
     """
     if core.is_profiler_enabled():
         return

@@ -97,7 +160,10 @@ def start_profiler(state):
 def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
-    """Stop the profiler.
+    """
+    Stop the profiler. Uers can use `fluid.profiler.start_profiler` and
+    `fluid.profiler.stop_profiler` to insert the code, except the usage of
+    `fluid.profiler.profiler` interface.

     Args:
         sorted_key (string) : If None, the profiling results will be printed

@@ -111,6 +177,23 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
             The `ave` means sorting by the average execution time.
         profile_path (string) : If state == 'All', it will write a profile
             proto output file.
+
+    Raises:
+        ValueError: If `sorted_key` is not in
+            ['calls', 'total', 'max', 'min', 'ave'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            profiler.start_profiler('GPU')
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                # except each iteration
+            profiler.stop_profiler('total', '/tmp/profile')
     """
     if not core.is_profiler_enabled():
         return

@@ -137,7 +220,12 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     Different from cuda_profiler, this profiler can be used to profile both CPU
     and GPU program. By defalut, it records the CPU and GPU operator kernels,
     if you want to profile other program, you can refer the profiling tutorial
-    to add more records.
+    to add more records in C++ code.
+
+    If the state == 'All', a profile proto file will be written to
+    `profile_path`. This file records timeline information during the execution.
+    Then users can visualize this file to see the timeline, please refer
+    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md

     Args:
         state (string) : The profiling state, which should be 'CPU' or 'GPU',

@@ -156,6 +244,25 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
             The `ave` means sorting by the average execution time.
         profile_path (string) : If state == 'All', it will write a profile
             proto output file.
+
+    Raises:
+        ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
+            not in ['calls', 'total', 'max', 'min', 'ave'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+                for pass_id in range(pass_num):
+                    for batch_id, data in enumerate(train_reader()):
+                        exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[],
+                                use_program_cache=True)
+                        # ...
     """
     start_profiler(state)
     yield
python/paddle/fluid/tests/unittests/test_layers.py

@@ -401,6 +401,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))

+    def test_maxout(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 5], dtype="float32")
+            y = layers.data(name='y', shape=[2, 3], dtype="float32")
+            output = layers.crop(x, shape=y)
+            self.assertIsNotNone(output)
+        print(str(program))
+

 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_optimizer.py

@@ -434,5 +434,71 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)


+class TestFtrlOptimizer(unittest.TestCase):
+    class MockFtrl(optimizer.FtrlOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_squared_str(self):
+            return self._squared_acc_str
+
+        def get_linear_str(self):
+            return self._linear_acc_str
+
+    def test_ftrl_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="mul.x",
+            optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        ftrl_optimizer = self.MockFtrl(
+            learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
+        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                       init_program)
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "ftrl"])
+
+        # Check accumulators
+        accumulators = ftrl_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 2)
+        self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators)
+        self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators)
+        squared_acc = accumulators[ftrl_optimizer.get_squared_str()]
+        linear_acc = accumulators[ftrl_optimizer.get_linear_str()]
+        self.assertEqual(len(squared_acc), 1)
+        self.assertEqual(len(linear_acc), 1)
+        self.assertTrue(mul_x.name in squared_acc)
+        self.assertTrue(mul_x.name in linear_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 3)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/transpiler/distribute_transpiler.py

@@ -24,7 +24,7 @@ Steps to transpile trainer:
 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
 2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
 3. modify trainer program add split_op to each grad variable.
-4. append send_op to send splited variables to server and
+4. append send_op to send splited variables to server and
 5. add recv_op to fetch params(splited blocks or origin param) from server.
 6. append concat_op to merge splited blocks to update local weights.

@@ -44,7 +44,7 @@ import numpy as np
 from ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
 from ..framework import Program, default_main_program, \
-    default_startup_program, \
+    default_startup_program, Block, \
     Variable, Parameter, grad_var_name
 from details import *

@@ -471,7 +471,7 @@ class DistributeTranspiler:
                     self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
                                              self.origin_program, merged_var)
                 else:
-                    self._append_pserver_non_opt_ops(block, op, endpoint)
+                    self._append_pserver_non_opt_ops(block, op)

         def __op_have_grad_input__(op):
             for varname in op.input_arg_names:

@@ -479,13 +479,39 @@ class DistributeTranspiler:
                     return varname
             return ""

+        def __clone_lr_op_sub_block__(op, program, new_block):
+            if not op.has_attr('sub_block'):
+                return
+
+            origin_block_desc = op.attr('sub_block')
+            origin_block = self.origin_program.block(origin_block_desc.id)
+            assert isinstance(origin_block, Block)
+            # we put the new sub block to new block to follow the block
+            # hierarchy of the original blocks
+            new_sub_block = program.create_block(new_block.idx)
+
+            # clone vars
+            for var in origin_block.vars:
+                new_sub_block.clone_variable(var)
+
+            # clone ops
+            for op in origin_block.ops:
+                self._clone_lr_op(program, new_sub_block, op)
+                # clone sub_block of op
+                __clone_lr_op_sub_block__(op, program, new_sub_block)
+
+            # reset the block of op
+            op.set_attr('sub_block', new_sub_block)
+
         # append lr decay ops to the child block if exists
         lr_ops = self._get_lr_ops()
         if len(lr_ops) > 0:
             lr_decay_block = pserver_program.create_block(
                 pserver_program.num_blocks - 1)
             for _, op in enumerate(lr_ops):
-                self._append_pserver_non_opt_ops(lr_decay_block, op, endpoint)
+                self._append_pserver_non_opt_ops(lr_decay_block, op)
+                # append sub blocks to pserver_program in lr_decay_op
+                __clone_lr_op_sub_block__(op, pserver_program, lr_decay_block)

         # append op to the current block
         grad_to_block_id = []

@@ -1116,7 +1142,29 @@ class DistributeTranspiler:
                 break
         return grad_block

-    def _append_pserver_non_opt_ops(self, optimize_block, opt_op, endpoint):
+    def _clone_lr_op(self, program, block, op):
+        inputs = self._get_input_map_from_op(
+            self.origin_program.global_block().vars, op)
+        for key, varlist in inputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                if var not in program.global_block().vars:
+                    block.clone_variable(var)
+
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, op)
+        for key, varlist in outputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                if var not in program.global_block().vars:
+                    block.clone_variable(var)
+
+        block.append_op(
+            type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
+
+    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
         program = optimize_block.program
         # Append the ops for parameters that do not need to be optimized/updated
         inputs = self._get_input_map_from_op(
python/paddle/fluid/transpiler/inference_transpiler.py

@@ -19,16 +19,30 @@ from ..executor import global_scope
 class InferenceTranspiler:
+    '''
+    Convert the fluid program to optimized inference program.
+
+    There are several optimizations, only fuse batch normalization is supported now.
+
+    Examples:
+
+    .. code-block:: python
+
+        # As InferenceTranspiler will modify the original program,
+        # please clone before use it.
+        inference_transpiler_program = program.clone()
+        t = fluid.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+    '''
+
     def transpile(self, program, place, scope=None):
         '''
-        Transpile the program. Support only fuse batch normalization now.
-
-        :param program: program to transpile
-        :type program: Program
-        :param place: inference place
-        :type place: Place
-        :param scope: inference scope
-        :type scope: Scope or None
+        Run the transpiler.
+
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope|None): inference Scope
         '''
         if not isinstance(program, Program):
             raise TypeError("program should be as Program type")

@@ -49,36 +63,43 @@ class InferenceTranspiler:
         can be integrated with them. Doing so will give us a forward acceleration,
         especially in environments like mobile or embedded.

-        For input X:
-        - Conv process:        X = input * W + bias
-        - Batch norm process:  X' = (X - mean) / std
-        - Scale Process:       Y = a * X' + b
+        For input :math:`X`:
+
+        - Conv process:        :math:`X = input * W + bias`
+        - Batch norm process:  :math:`X' = (X - mean) / std`
+        - Scale Process:       :math:`Y = a * X' + b`

         After fuse into one operation:

-        Y = (input * W + bias - mean) / std * a + b
-          = input * a * W / std + ((bias - mean) / std * a + b)
+        .. math::
+
+            Y &= (input * W + bias - mean) / std * a + b \\\\
+              &= input * a * W / std + ((bias - mean) / std * a + b)

         The operator transformation is:
+
         - before:
+
           - conv->batch_norm->any_other_op (bias == 0)
           - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
+
         - after:
+
           - conv->elementwise_add->any_other_op
+
         The transpile stages are:
+
         1. insert elementwise_add op when bias == 0.
         2. fuse the batch_norm's parameters to conv and elementwise_add operators.
         3. remove batch_norm ops which are not used in any other ops.
        4. adjust the input of any_other_op to be the output of elementwise_add operator.
         5. remove unused variables.

-        :param program: program to transpile
-        :type program: Program
-        :param place: inference place
-        :type place: Place
-        :param scope: inference scope
-        :type scope: Scope
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope): inference Scope
         '''
         self.scope = scope
         self.place = place
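The fusion formula in the docstring can be checked numerically: folding batch normalization into the convolution's weight and bias gives the same output. Below is a small NumPy check of that algebra, treating the convolution as a plain matrix multiply for brevity; it illustrates the documented formula only, not the transpiler's implementation:

    import numpy as np

    rng = np.random.RandomState(0)
    inp = rng.randn(4, 3)              # a batch of inputs
    W = rng.randn(3, 5)                # "conv" weight, as a plain matmul
    bias = rng.randn(5)
    mean, std = rng.randn(5), rng.rand(5) + 0.5
    a, b = rng.randn(5), rng.randn(5)  # batch-norm scale and shift

    # Original pipeline: conv -> batch_norm -> scale/shift
    y_ref = (inp.dot(W) + bias - mean) / std * a + b

    # Fused form: new weight a*W/std, new bias (bias - mean)/std*a + b
    y_fused = inp.dot(W * (a / std)) + ((bias - mean) / std * a + b)

    print(np.allclose(y_ref, y_fused))  # True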