Merge branch 'develop' into core_inference_fix_run

987a32da · Liu Yiqun · 09687534 · b594251f · 987a32da · 987a32da
24 changed file
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+# Fluid 分布式版本使用指南
+本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行，以及将单机训练脚本改造成支持集群训练的版本
+## 准备工作
+* 可用的集群
+    包含一个或多个计算节点的集群，每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址，集群内的所有计算节点可以通过网络相互通信。
+* 安装PaddlePaddle Fluid with Distribution版本
+    所有的计算节点上均需要安装分布式版本的PaddlePaddle，在使用GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
+    **注意：**当前对外提供的PaddlePaddle版本并不支持分布式，需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
+    cmake编译命令中需要将WITH_DISTRIBUTE设置为ON，下面是一个cmake编译指令示例：
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+## 更新训练脚本
+这里，我们以[Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例，描述如何将单机训练脚本改造成支持集群训练的版本。
+### 单机训练脚本示例
+```python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+BATCH_SIZE = 20
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+我们创建了一个简单的全连接神经网络程序，并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
+### 介绍Parameter Server
+在非分布式版本的训练脚本中，只存在Trainer一种角色，它不仅处理常规的计算任务，也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中，由于存在多个Trainer节点进行同样的数据计算任务，因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中，我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
+**因此，在分布式的Fluid环境中，我们有两个角色需要创建，分别是Parameter Server和Trainer。**
+### 分布式训练 
+Fluid专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数，将它们分隔为两部分，通过send/recv 操作算子进行连接，优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
+```python
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) 
+```
+将Distributed Transpiler、优化算子和梯度参数放在同一段代码中，如下所示：
+```python
+... #define the program, cost, and create sgd optimizer
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+... #create executor
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+```
+### 分布式训练脚本运行说明
+分布式任务的运行需要将表格中说明的多个参数进行赋值:
+| 参数名 | 值类型 | 说明 | 示例 |
+|:-------------|:------|:---------------------------------------|:-------------|
+| trainer_id | int | 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 | 0/1/2/3 |
+| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
+| trainers | int | 训练节点的总个数，>0的数字 | 4 |
+| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
+| training_role | str | 节点角色， TRAINER/PSERVER | PSERVER |
+**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下： 
+```python
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops,
+    params_grads,
+    trainer_id,
+    pservers=pserver,
+    trainers=trainers)
+if training_role == "PSERVER":
+    pserver_prog = t.get_pserver_program(server_endpoint)
+    pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
+```
+### Demo
+完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
+第一步，进入demo代码所在目录：
+```bash
+cd /paddle/python/paddle/fluid/tests/book
+```
+第二步，启动Parameter Server：
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+```
+执行命令后请等待出现提示： ```Server listening on 192.168.1.2:6174 ```, 表示Parameter Server已经正常启动。
+第三步，启动Trainer：
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+```
+由于我们定义的Trainer的数量是2个，因此需要在另外一个计算节点上再启动一个Trainer。
+现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
--- a/doc/v2/faq/index_en.rst
+++ b/doc/v2/faq/index_en.rst
 FAQ
 ====
+This document provides answers to some of the frequently asked questions about PaddlePaddle. If your question is not covered here, please visit the `PaddlePaddle Community <https://github.com/PaddlePaddle/Paddle/issues>`_ to look for an answer or submit a new `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_; we will reply as soon as possible.
 ..  toctree::
  :maxdepth: 1

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -613,3 +613,14 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
                                ops::grad_functor<double>>);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
+REGISTER_OP_CPU_KERNEL(relu,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ReluFunctor<float>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ReluFunctor<double>>);
+REGISTER_OP_CPU_KERNEL(
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                         ops::ReluGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::ReluGradFunctor<double>>);
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
@@ -31,3 +32,16 @@ namespace ops = paddle::operators;
                                ops::grad_functor<double>>);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
+REGISTER_OP_CUDA_KERNEL(
+    relu, ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                                ops::ReluFunctor<float>>,
+    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                          ops::ReluFunctor<double>>,
+    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                          ops::ReluFunctor<paddle::platform::float16>>);
+REGISTER_OP_CUDA_KERNEL(
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                         ops::ReluGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
+                              ops::ReluGradFunctor<double>>);
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -772,7 +772,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
  __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
  __macro(exp, ExpFunctor, ExpGradFunctor);                          \
-  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
  __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \

--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/average_accumulates_op.h"
+namespace paddle {
+namespace operators {
+// CPU specialization: loads the three int64_t counters of the
+// average_accumulates op from their input tensors into the caller's variables.
+// Each value is read directly through the tensor's int64_t data pointer
+// (element 0).
+template <>
+void GetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t& num_updates_,
+    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+  const Tensor* old_accumulates_in =
+      ctx.Input<Tensor>("in_old_num_accumulates");
+  const Tensor* accumulates_in = ctx.Input<Tensor>("in_num_accumulates");
+  const Tensor* updates_in = ctx.Input<Tensor>("in_num_updates");
+
+  old_num_accumulates_ = *old_accumulates_in->data<int64_t>();
+  num_accumulates_ = *accumulates_in->data<int64_t>();
+  num_updates_ = *updates_in->data<int64_t>();
+}
+// CPU specialization: stores the three int64_t counters of the
+// average_accumulates op into element 0 of their output tensors, writing
+// directly through each tensor's int64_t data pointer.
+template <>
+void SetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  Tensor* old_accumulates_out = ctx.Output<Tensor>("out_old_num_accumulates");
+  Tensor* accumulates_out = ctx.Output<Tensor>("out_num_accumulates");
+  Tensor* updates_out = ctx.Output<Tensor>("out_num_updates");
+
+  *old_accumulates_out->data<int64_t>() = old_num_accumulates_;
+  *accumulates_out->data<int64_t>() = num_accumulates_;
+  *updates_out->data<int64_t>() = num_updates_;
+}
+// Operator definition for average_accumulates: validates that every input and
+// output slot is present, infers output shapes, and selects the kernel type.
+class AverageAccumulatesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks that all seven inputs and six outputs are wired, then sets the
+  // output shapes: the three sums take the shape of `param`, the three
+  // counters have shape {1}.
+  // Each error message names the slot exactly as it is declared in the op
+  // proto, so the text can be matched against the program description.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("param"),
+        "Input (param) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_1"),
+        "Input (in_sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_2"),
+        "Input (in_sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_3"),
+        "Input (in_sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_accumulates"),
+        "Input (in_num_accumulates) of average_accumulates op should "
+        "not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
+                   "Input (in_old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("in_num_updates"),
+                   "Input (in_num_updates) of average_accumulates op should "
+                   "not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_1"),
+        "Output (out_sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_2"),
+        "Output (out_sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_3"),
+        "Output (out_sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
+                   "Output (out_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
+                   "Output (out_old_num_accumulates) of average_accumulates "
+                   "op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_num_updates"),
+                   "Output (out_num_updates) of average_accumulates op should "
+                   "not be null.");
+    auto in_dim = ctx->GetInputDim("param");
+    ctx->SetOutputDim("out_sum_1", in_dim);
+    ctx->SetOutputDim("out_sum_2", in_dim);
+    ctx->SetOutputDim("out_sum_3", in_dim);
+    ctx->SetOutputDim("out_num_accumulates", {1});
+    ctx->SetOutputDim("out_old_num_accumulates", {1});
+    ctx->SetOutputDim("out_num_updates", {1});
+  }
+ protected:
+  // The kernel is chosen from the data type of `param` and the place the op
+  // is executed on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("param")->type()),
+        ctx.GetPlace());
+  }
+};
+// Declares the proto of the average_accumulates op: its inputs, outputs,
+// attributes and the user-facing documentation text.
+class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("param", "(Tensor), The parameter to be accumulated.");
+    AddInput("in_sum_1",
+             "(Tensor), A tensor used to store the parameter "
+             "sums with the same shape as input(param).");
+    AddInput("in_sum_2",
+             "(Tensor), An auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param). It is used to avoid loss of precision due to too "
+             "many sums.");
+    AddInput("in_sum_3",
+             "(Tensor), An auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param).");
+    AddInput("in_num_accumulates",
+             "(Tensor<int64_t>), The accumulating times of current window with "
+             "shape [1].");
+    AddInput(
+        "in_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddInput("in_num_updates",
+             "(Tensor<int64_t>), The total number of batches used by training "
+             "before this batch with shape [1].");
+    AddOutput("out_sum_1",
+              "(Tensor), A tensor used to store the "
+              "parameter sums with the same shape as input(param).");
+    AddOutput("out_sum_2",
+              "(Tensor), An auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param). It is used to avoid loss of precision due to too "
+              "many sums.");
+    AddOutput("out_sum_3",
+              "(Tensor), An auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param).");
+    AddOutput(
+        "out_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of current window with "
+        "shape [1].");
+    AddOutput(
+        "out_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddOutput(
+        "out_num_updates",
+        "(Tensor<int64_t>), The total number of batches used by training "
+        "before this batch with shape [1].");
+    AddAttr<float>("average_window",
+                   "(float, default 0) "
+                   "The rate of average window size relative to num_updates.")
+        .SetDefault(0);
+    // max_average_window has no default, unlike the other two attributes.
+    AddAttr<int64_t>("max_average_window",
+                     "(int64_t) "
+                     "Maximum size of average window. It suggests that the "
+                     "number of mini-batches "
+                     "in one pass is appropriate value to set.");
+    AddAttr<int64_t>("min_average_window",
+                     "(int64_t, default 10000L) "
+                     "Minimum size of average window.")
+        .SetDefault(10000L);
+    AddComment(R"DOC(
+AverageAccumulates Operator.
+Accumulate the sum of parameter within sliding window. The size of sliding window is
+determined by 'average_window', 'max_average_window' and 'min_average_window'.
+Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
+'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
+All the accumulators were inited to zero before training.
+And for a mini-batch in training, accumulators were computed as below steps:
+    num_updates += 1
+    num_accumulates += 1
+    sum_1 += param
+    if num_updates % kMaxNumAccumulates == 0:
+        sum_2 += sum_1
+        sum_1 = 0
+    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
+        sum_3 = sum_1 + sum_2
+        sum_1 = 0
+        sum_2 = 0
+        old_num_accumulates = num_accumulates
+        num_accumulates = 0
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the average_accumulates operator together with its proto maker;
+// EmptyGradOpMaker is passed as the gradient-op maker.
+REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
+                  ops::AverageAccumulatesOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// Register the CPU kernels for float and double element types.
+REGISTER_OP_CPU_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ b/paddle/fluid/operators/average_accumulates_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/platform/gpu_info.h"
+namespace paddle {
+namespace operators {
+// CUDA specialization: copies the three int64_t counters of the
+// average_accumulates op from their input tensors into the caller's host
+// variables, issuing the copies on the op's CUDA stream.
+template <>
+void GetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t& num_updates_,
+    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
+  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
+  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
+  auto stream = ctx.cuda_device_context().stream();
+  // Copy from the place this kernel is executing on. A default-constructed
+  // platform::CUDAPlace denotes device 0, which is the wrong source device
+  // whenever the op runs on any other GPU.
+  auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  memory::Copy(platform::CPUPlace(), &old_num_accumulates_, gpu_place,
+               in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
+               stream);
+  memory::Copy(platform::CPUPlace(), &num_accumulates_, gpu_place,
+               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
+  memory::Copy(platform::CPUPlace(), &num_updates_, gpu_place,
+               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
+}
+// CUDA specialization: copies the three int64_t counters of the
+// average_accumulates op from host values into their output tensors, issuing
+// the copies on the op's CUDA stream.
+template <>
+void SetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  auto stream = ctx.cuda_device_context().stream();
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
+  // Copy to the place this kernel is executing on. A default-constructed
+  // platform::CUDAPlace denotes device 0, which is the wrong destination
+  // device whenever the op runs on any other GPU.
+  auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  memory::Copy(gpu_place, out_old_num_accumulates->data<int64_t>(),
+               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
+               stream);
+  memory::Copy(gpu_place, out_num_accumulates->data<int64_t>(),
+               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
+               stream);
+  memory::Copy(gpu_place, out_num_updates->data<int64_t>(),
+               platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
+}
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the CUDA kernels of average_accumulates for float and double
+// element types.
+REGISTER_OP_CUDA_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+// Reads the op's three counters into the given references. Only declared
+// here; it is called by AverageAccumulatesKernel::Compute with its
+// DeviceContext, so a definition must be supplied for each device context
+// that is used.
+template <typename DeviceContext>
+void GetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t& num_updates, int64_t& num_accumulates,
+                     int64_t& old_num_accumulates);
+// Writes the given counter values to the op's outputs. Only declared here;
+// it is called by AverageAccumulatesKernel::Compute with its DeviceContext,
+// so a definition must be supplied for each device context that is used.
+template <typename DeviceContext>
+void SetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t num_updates, int64_t num_accumulates,
+                     int64_t old_num_accumulates);
+// Kernel of the average_accumulates op. It maintains three running sums of
+// `param` (sum_1, sum_2, sum_3) and three int64_t counters (num_updates,
+// num_accumulates, old_num_accumulates).
+// DeviceContext selects the GetAccumulators/SetAccumulators specialization
+// and the Eigen device; T is the element type of param and the sum tensors.
+template <typename DeviceContext, typename T>
+class AverageAccumulatesKernel : public framework::OpKernel<T> {
+ public:
+  // Performs one accumulation step: bumps the counters, adds param to sum_1,
+  // periodically folds sum_1 into sum_2, and, when the window condition
+  // holds, moves the accumulated sum into sum_3 and resets the window.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Every kMaxNumAccumulates updates sum_1 is folded into sum_2 (below);
+    // this is done to avoid loss of precision.
+    static const int64_t kMaxNumAccumulates = 16384;
+    // Get accumulators from input
+    int64_t num_updates = 0;
+    int64_t num_accumulates = 0;
+    int64_t old_num_accumulates = 0;
+    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+    // Get attrs
+    float average_window = ctx.Attr<float>("average_window");
+    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
+    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
+    // Clamp so that the minimum window never exceeds the maximum window.
+    min_average_window =
+        std::min<int64_t>(min_average_window, max_average_window);
+    // Get inputs
+    auto* param = ctx.Input<Tensor>("param");
+    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
+    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
+    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
+    // Flattened Eigen views, so all arithmetic below is element-wise.
+    auto param_tensor = EigenVector<T>::Flatten(*param);
+    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
+    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
+    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
+    // Get outputs
+    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
+    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
+    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
+    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+    // Compute
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    math::SetConstant<DeviceContext, T> constant_functor;
+    ++num_updates;
+    ++num_accumulates;
+    // Default step: sum_1 gains param, sum_2 and sum_3 are carried over.
+    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+    out_sum_2_tensor.device(place) = in_sum_2_tensor;
+    out_sum_3_tensor.device(place) = in_sum_3_tensor;
+    if (num_updates % kMaxNumAccumulates == 0) {
+      // Move the sum to a different buffer to avoid loss of precision due to
+      // too many sums.
+      // NOTE(review): this reads in_sum_1, not out_sum_1. It includes the
+      // param added above only if in_sum_1 and out_sum_1 share memory, as the
+      // op's documentation text states -- confirm that the sharing is
+      // guaranteed.
+      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+    }
+    // num_updates * average_window is a float product; it is converted to
+    // int64_t when passed to std::min<int64_t>.
+    if (num_accumulates >= min_average_window &&
+        num_accumulates >= std::min<int64_t>(max_average_window,
+                                             num_updates * average_window)) {
+      //  Now the average window is too long, discard the old sum.
+      // NOTE(review): as above, this reads the in_* tensors and relies on
+      // them reflecting the updates written to out_sum_1/out_sum_2 -- confirm.
+      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
+                       0.0);
+      // Start a new window, remembering the length of the one just closed.
+      old_num_accumulates = num_accumulates;
+      num_accumulates = 0;
+    }
+    // Set accumulators to output
+    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -78,7 +78,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
      for (int64_t i = 0; i < batch_size; ++i) {
        PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
        int64_t index = i * class_num + label_data[i];
-        dx_data[index] = -dy_data[i] / x_data[index];
+        dx_data[index] = math::TolerableValue<T>()(-dy_data[i] / x_data[index]);
      }
    }
  }

--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -89,6 +89,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
      XGrad->mutable_data<T>(context.GetPlace())));
 }
+template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<float>;

--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -56,7 +56,9 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace,
+namespace plat = paddle::platform;
-                   ops::SoftmaxCUDNNKernel<float>);
+REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
-REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::SoftmaxCUDNNKernel<float>,
+                   ops::SoftmaxCUDNNKernel<plat::float16>);
+REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
                   ops::SoftmaxGradCUDNNKernel<float>);
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include <iostream>
+namespace paddle {
+namespace operators {
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::MKLDNNMemDesc;
+using mkldnn::memory;  // Note: paddle has also "memory" namespace
+using mkldnn::primitive;
+using mkldnn::softmax_forward;
+using mkldnn::prop_kind;
+using mkldnn::stream;
+// Forward softmax kernel that delegates the computation to an MKL-DNN
+// softmax_forward primitive. Input "X" must be a 2D tensor; output "Out" is
+// allocated here and must have the same two dimensions.
+template <typename T>
+class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& mkldnn_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto engine = mkldnn_ctx.GetEngine();
+
+    const Tensor* x = ctx.Input<Tensor>("X");
+    Tensor* out = ctx.Output<Tensor>("Out");
+    PADDLE_ENFORCE(x->dims().size() == 2UL,
+                   "The input of softmax op must be a 2D matrix.");
+
+    const T* x_data = x->data<T>();
+    // mutable_data allocates the output buffer on the current place.
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> x_dims = paddle::framework::vectorize2int(x->dims());
+    std::vector<int> out_dims = paddle::framework::vectorize2int(out->dims());
+    // The tensor is 2D and normalization runs along its last axis (axis 1),
+    // so both dimensions of input and output have to agree.
+    PADDLE_ENFORCE(((x_dims[0] == out_dims[0]) && (x_dims[1] == out_dims[1])),
+                   "Softmax input and output dimensions should match");
+
+    // One memory descriptor serves both source and destination.
+    // Currently only the NC data format is supported.
+    // TODO(jczaja-intel): support more formats
+    memory::dims nc_dims{x_dims[0], x_dims[1]};
+    auto nc_md = MKLDNNMemDesc(nc_dims, memory::f32, memory::format::nc);
+
+    // Softmax is taken over the innermost dimension, i.e. C of NC.
+    auto fwd_desc =
+        softmax_forward::desc(prop_kind::forward_scoring, nc_md, 1 /*dim: C*/);
+
+    // Wrap the existing tensor buffers as MKL-DNN memory primitives.
+    auto src_mem = memory({nc_md, engine},
+                          static_cast<void*>(const_cast<T*>(x_data)));
+    auto dst_mem = memory({nc_md, engine}, static_cast<void*>(out_data));
+
+    auto fwd_pd = softmax_forward::primitive_desc(fwd_desc, engine);
+    auto fwd = softmax_forward(fwd_pd, src_mem, dst_mem);
+
+    // Run the single primitive on an eager stream and block until it is done.
+    std::vector<primitive> pipeline{fwd};
+    stream(stream::kind::eager).submit(pipeline).wait();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the MKLDNN-library softmax kernel on CPUPlace; only the float
+// instantiation is registered.
+REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::SoftmaxMKLDNNKernel<float>);
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -13,7 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/softmax_op.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
@@ -38,26 +44,32 @@ class SoftmaxOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+    framework::LibraryType library_{framework::LibraryType::kPlain};
-    bool runtime_cudnn_support = false;
 #ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
+    if (platform::CanCUDNNBeUsed(ctx)) {
-      auto& dev_ctx =
+      library_ = framework::LibraryType::kCUDNN;
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
    }
 #endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_MKLDNN
-    if (use_cudnn && runtime_cudnn_support) {
+    if (library_ == framework::LibraryType::kPlain &&
-      library_ = framework::LibraryType::kCUDNN;
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
    }
+#endif
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+                        "float16 can only be used when CUDNN is used");
+    }
    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+                                   framework::StringToDataLayout(data_format),
-        framework::StringToDataLayout(data_format), library_);
+                                   library_);
  }
 };
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -77,6 +89,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
        "Defaults to \"NHWC\". Specify the data format of the output data, "
        "the input will be transformed automatically. ")
        .SetDefault("AnyLayout");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
    AddComment(R"DOC(
 Softmax Operator.
@@ -119,19 +134,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+    framework::LibraryType library_{framework::LibraryType::kPlain};
-    bool runtime_cudnn_support = false;
 #ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
+    if (platform::CanCUDNNBeUsed(ctx)) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
-    }
-#endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
      library_ = framework::LibraryType::kCUDNN;
    }
+#endif
    std::string data_format = ctx.Attr<std::string>("data_format");
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -10,43 +10,45 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
+#include <unordered_set>
 #include "paddle/fluid/memory/memory.h"
 namespace paddle {
 namespace platform {
 DeviceContextPool* DeviceContextPool::pool = nullptr;
-const platform::DeviceContext* DeviceContextPool::Get(
+platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
-    const platform::Place& place) {
  auto it = device_contexts_.find(place);
  if (it == device_contexts_.end()) {
    PADDLE_THROW(
        "'Place' is not supported, Please re-compile with WITH_GPU "
        "option");
  }
-  return it->second;
+  return it->second.get();
 }
 DeviceContextPool::DeviceContextPool(
    const std::vector<platform::Place>& places) {
  PADDLE_ENFORCE_GT(places.size(), 0);
-  for (size_t i = 0; i < places.size(); i++) {
+  using PtrType = std::unique_ptr<DeviceContext>;
-    if (platform::is_cpu_place(places[i])) {
+  std::unordered_set<Place, PlaceHash> set;
+  for (auto& p : places) {
+    set.insert(p);
+  }
+  for (auto& p : set) {
+    if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
-      device_contexts_.emplace(places[i],
+      device_contexts_.emplace(
-                               new platform::MKLDNNDeviceContext(
+          p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
-                                   boost::get<platform::CPUPlace>(places[i])));
 #else
-      device_contexts_.emplace(places[i],
+      device_contexts_.emplace(
-                               new platform::CPUDeviceContext(
+          p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
-                                   boost::get<platform::CPUPlace>(places[i])));
 #endif
-    } else if (platform::is_gpu_place(places[i])) {
+    } else if (platform::is_gpu_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(places[i],
+      device_contexts_.emplace(
-                               new platform::CUDADeviceContext(
+          p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
-                                   boost::get<platform::CUDAPlace>(places[i])));
 #else
      PADDLE_THROW(
          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -159,6 +161,7 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 void CUDADeviceContext::Wait() const {
+  std::lock_guard<std::mutex> guard(mutex_);
  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
  PADDLE_ENFORCE(cudaGetLastError());
 }

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -103,6 +103,7 @@ class CUDADeviceContext : public DeviceContext {
  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
+  mutable std::mutex mutex_;
  cudaStream_t stream_;
  cudnnHandle_t cudnn_handle_;
  cublasHandle_t cublas_handle_;
@@ -159,7 +160,7 @@ class DeviceContextPool {
  }
  /*! \brief  Return handle of single device context. */
-  const platform::DeviceContext* Get(const platform::Place& place);
+  platform::DeviceContext* Get(const platform::Place& place);
  template <typename Place>
  const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
@@ -172,19 +173,8 @@ class DeviceContextPool {
 private:
  static DeviceContextPool* pool;
-  constexpr static int LEFT_SHIFT = 8;
+  std::unordered_map<const platform::Place,
-  struct Hash {
+                     std::unique_ptr<platform::DeviceContext>, PlaceHash>
-    std::hash<int> hash_;
-    size_t operator()(const platform::Place& place) const {
-      int pre_hash = place.which() << LEFT_SHIFT;
-      if (platform::is_gpu_place(place)) {
-        pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
-      }
-      return hash_(pre_hash);
-    }
-  };
-  std::unordered_map<const platform::Place, const platform::DeviceContext*,
-                     Hash>
      device_contexts_;
  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
 };

--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -65,6 +65,18 @@ bool is_cpu_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 bool is_same_place(const Place &, const Place &);
+// Hash functor for Place, so that Place can be used as the key of
+// unordered containers (e.g. std::unordered_set<Place, PlaceHash>).
+struct PlaceHash {
+  std::size_t operator()(const Place &p) const {
+    // Number of low bits kept free for p.which() below the device id.
+    // NOTE(review): assumes p.which() always fits in these bits - confirm.
+    constexpr size_t num_dev_bits = 4;
+    std::hash<int> ihash;
+    // Only GPU places contribute a device id; every other place uses 0.
+    size_t dev_id = 0;
+    if (is_gpu_place(p)) {
+      dev_id = boost::get<CUDAPlace>(p).device;
+    }
+    // Pack the device id (high bits) and p.which() (low bits) into a single
+    // integer and hash it. The size_t operand is converted to int when it is
+    // passed to std::hash<int>.
+    return ihash(dev_id << num_dev_bits | p.which());
+  }
+};
 std::ostream &operator<<(std::ostream &, const Place &);
 template <typename Visitor>

--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -131,7 +131,7 @@ def make_channel(dtype, capacity=0):
    return channel
-def channel_send(channel, value, copy=False):
+def channel_send(channel, value, is_copy=False):
    """
    Sends a value through a channel variable. Used by an unbuffered or buffered
    channel to pass data from within or to a concurrent Go block, where
@@ -141,8 +141,8 @@ def channel_send(channel, value, copy=False):
        channel (Variable|Channel): Channel variable created using
        `make_channel`.
        value (Variable): Value to send to channel
-        copy (bool): Copy data while channel send. If False, then data
+        is_copy (bool): Copy data while channel send. If False, then data
-        is moved. The input cannot be used after move.
+        is moved. The input cannot be used after move. (default False)
    Returns:
        Variable: The boolean status on whether or not the channel
                  successfully sent the passed value.
@@ -166,7 +166,7 @@ def channel_send(channel, value, copy=False):
    X = value
-    if copy is True:
+    if is_copy is True:
        copied_X = helper.create_variable(
            name=unique_name.generate(value.name + '_copy'),
            type=value.type,

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -918,6 +918,24 @@ class Block(object):
                name=v.name)
            self.vars[new_p.name] = new_p
+    def clone_variable(self, var):
+        """
+        Clone a variable into the current block.
+        The clone is created through `create_var` with the name, shape,
+        dtype, type and lod_level of `var`. It is always created with
+        persistable=True, whatever the persistable flag of `var` is.
+        Args:
+            var: the variable to be cloned. An assertion fails if it is not
+                a Variable.
+        Returns:
+            The new variable cloned from 'var' in the current block.
+        """
+        assert isinstance(var, Variable)
+        return self.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=True)
 class Program(object):
    def __init__(self):
@@ -960,14 +978,14 @@ class Program(object):
        """Clone the Program object
        Set for_test to False when we want to clone the program for training.
-        Set for_test to True when we want to clone the program for testing.         
+        Set for_test to True when we want to clone the program for testing.
        Args:
            for_test(bool): Some operators, such as batch_norm and drop_out ops,
                behave differently in training and testing. If for_test is True,
                the is_test attributes in these operators will be set to True for
-                testing purposes, otherwise, they remain unchanged.  
+                testing purposes, otherwise, they remain unchanged.
        Returns(Program):
            The cloned Program object.
        """

--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -399,6 +399,9 @@ class LayerHelper(object):
        if isinstance(act, basestring):
            act = {'type': act}
        tmp = self.create_tmp_variable(dtype=input_var.dtype)
+        if 'use_mkldnn' in self.kwargs:
+            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
        act_type = act.pop('type')
        self.append_op(
            type=act_type,

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -82,6 +82,7 @@ def fc(input,
       num_flatten_dims=1,
       param_attr=None,
       bias_attr=None,
+       use_mkldnn=False,
       act=None,
       name=None):
    """
@@ -163,8 +164,11 @@ def fc(input,
            inputs={"X": input_var,
                    "Y": w},
            outputs={"Out": tmp},
-            attrs={"x_num_col_dims": num_flatten_dims,
+            attrs={
-                   "y_num_col_dims": 1})
+                "x_num_col_dims": num_flatten_dims,
+                "y_num_col_dims": 1,
+                'use_mkldnn': use_mkldnn
+            })
        mul_results.append(tmp)
    # sum

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -69,6 +69,7 @@ __all__ = [
    'gaussian_random_batch_size_like',
    'cumsum',
    'scatter',
+    'sum',
 ] + __activations__
 for _OP in set(__all__):

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from collections import defaultdict
+from paddle.fluid.framework import Program
 import framework
 import layers
 from backward import append_backward
@@ -23,9 +23,11 @@ from initializer import Constant
 from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
 from clip import append_gradient_clip_ops, error_clip_callback
+from contextlib import contextmanager
 __all__ = [
-    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Adadelta'
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
+    'Adadelta', 'ModelAverage'
 ]
@@ -121,7 +123,12 @@ class Optimizer(object):
        """
        pass
-    def _add_accumulator(self, name, param, dtype=None, fill_value=0.0):
+    def _add_accumulator(self,
+                         name,
+                         param,
+                         dtype=None,
+                         fill_value=0.0,
+                         shape=None):
        """Utility function to add an accumulator for a parameter
        Args:
@@ -135,17 +142,19 @@ class Optimizer(object):
                param.name in self._accumulators[name]):
            raise Exception("Accumulator {} already exists for parameter {}".
                            format(name, param.name))
+        if shape == None:
+            shape = param.shape
        assert isinstance(self.helper, LayerHelper)
        var = self.helper.create_global_variable(
            name=unique_name.generate(name),
            persistable=True,
            dtype=dtype or param.dtype,
            type=param.type,
-            shape=param.shape)
+            shape=shape)
        self.helper.set_variable_initializer(
            var, initializer=Constant(value=float(fill_value)))
        self._accumulators[name][param.name] = var
+        return var
    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter
@@ -797,3 +806,143 @@ Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
 Adadelta = AdadeltaOptimizer
 RMSProp = RMSPropOptimizer
+class ModelAverage(Optimizer):
+    """Accumulate the average of parameters within a sliding window. The
+    averaged values can be applied to the parameter variables of the current
+    model by calling the 'apply()' method, and the 'restore()' method is used
+    to restore the original parameter values of the current model.
+    The size of average window is determined by average_window_rate,
+    min_average_window, max_average_window and current update times.
+    While the averaged values are applied, the original parameter values are
+    kept in the corresponding gradient variables.
+    Args:
+        params_grads: A list of parameter-grad variable pairs. Pairs whose
+            grad is None are ignored.
+        average_window_rate: The rate of average window.
+        min_average_window: The minimum size of average window.
+        max_average_window: The maximum size of average window.
+    Examples:
+        ...
+        optimizer = fluid.optimizer.Momentum()
+        _, params_grads = optimizer.minimize(cost)
+        model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
+                                                min_average_window=10000,
+                                                max_average_window=20000)
+        for pass_id in range(args.pass_num):
+            for data in train_reader():
+                exe.run(fluid.default_main_program()...)
+            with model_average.apply(exe):
+                for data in test_reader():
+                    exe.run(inference_program...)
+    """
+    def __init__(self,
+                 params_grads,
+                 average_window_rate,
+                 min_average_window=10000,
+                 max_average_window=10000,
+                 **kwargs):
+        # 0.0 is the first positional argument of Optimizer.__init__
+        # (presumably the learning rate - TODO confirm against the base class).
+        super(ModelAverage, self).__init__(0.0, **kwargs)
+        # The rate is stored under the name of the op attribute it feeds.
+        self.average_window = average_window_rate
+        self.min_average_window = min_average_window
+        self.max_average_window = max_average_window
+        self.params_grads = params_grads
+        # Step 1: append one 'average_accumulates' op per trained parameter.
+        for param, grad in self.params_grads:
+            if grad is not None:
+                self._append_average_accumulate_op(param)
+        # Step 2: build a separate program that writes the averages into the
+        # parameters.
+        # NOTE(review): relies on program_guard being in scope in this
+        # module - confirm.
+        self.apply_program = Program()
+        block = self.apply_program.global_block()
+        with program_guard(main_program=self.apply_program):
+            for param_grad in self.params_grads:
+                if param_grad[1] is not None:
+                    self._add_average_apply_op(block, param_grad)
+        # Step 3: build a separate program that writes the backed-up values
+        # back into the parameters.
+        self.restore_program = Program()
+        block = self.restore_program.global_block()
+        with program_guard(main_program=self.restore_program):
+            for param_grad in self.params_grads:
+                if param_grad[1] is not None:
+                    self._add_average_restore_op(block, param_grad)
+    def _add_average_apply_op(self, block, param_grad):
+        """Add the ops that replace one parameter by its average.
+        Args:
+            block: the block the variables are cloned into.
+            param_grad: a (parameter, gradient) pair.
+        """
+        # Clone the parameter, its gradient and all of its accumulators into
+        # 'block' so that the ops below can refer to them.
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
+        sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
+        sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
+        num_accumulates = block.clone_variable(
+            self._get_accumulator('num_accumulates', param))
+        old_num_accumulates = block.clone_variable(
+            self._get_accumulator('old_num_accumulates', param))
+        # num_updates is cloned into the block but not read by any op below.
+        num_updates = block.clone_variable(
+            self._get_accumulator('num_updates', param))
+        # backup param value to grad
+        layers.assign(input=param, output=grad)
+        # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
+        tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
+        # NOTE: the local name 'sum' shadows the builtin inside this method.
+        sum = layers.sum(x=[sum_1, sum_2, sum_3])
+        # Both operands are cast to float32 before the division.
+        tmp = layers.cast(x=tmp, dtype='float32')
+        sum = layers.cast(x=sum, dtype='float32')
+        layers.elementwise_div(x=sum, y=tmp, out=param)
+    def _add_average_restore_op(self, block, param_grad):
+        """Add the op that copies the backed-up value (held in the gradient
+        variable) back into the parameter.
+        Args:
+            block: the block the variables are cloned into.
+            param_grad: a (parameter, gradient) pair.
+        """
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        layers.assign(input=grad, output=param)
+    def _append_average_accumulate_op(self, param):
+        """Create the accumulators of 'param' and append the
+        'average_accumulates' op that updates them.
+        Args:
+            param: the parameter variable to be averaged.
+        """
+        self.helper = LayerHelper("average_accumulate")
+        # Three running sums; no shape is passed, so they take the shape of
+        # the parameter.
+        sum_1 = self._add_accumulator('sum_1', param)
+        sum_2 = self._add_accumulator('sum_2', param)
+        sum_3 = self._add_accumulator('sum_3', param)
+        # Three int64 counters of shape [1].
+        num_accumulates = self._add_accumulator(
+            'num_accumulates', param, dtype='int64', shape=[1])
+        old_num_accumulates = self._add_accumulator(
+            'old_num_accumulates', param, dtype='int64', shape=[1])
+        num_updates = self._add_accumulator(
+            'num_updates', param, dtype='int64', shape=[1])
+        # Every accumulator is passed as both an input and the matching
+        # output of the op.
+        self.helper.append_op(
+            type='average_accumulates',
+            inputs={
+                "param": param,
+                "in_sum_1": sum_1,
+                "in_sum_2": sum_2,
+                "in_sum_3": sum_3,
+                "in_num_accumulates": num_accumulates,
+                "in_old_num_accumulates": old_num_accumulates,
+                "in_num_updates": num_updates
+            },
+            outputs={
+                "out_sum_1": sum_1,
+                "out_sum_2": sum_2,
+                "out_sum_3": sum_3,
+                "out_num_accumulates": num_accumulates,
+                "out_old_num_accumulates": old_num_accumulates,
+                "out_num_updates": num_updates,
+            },
+            attrs={
+                "average_window": self.average_window,
+                "min_average_window": self.min_average_window,
+                "max_average_window": self.max_average_window,
+            })
+    @contextmanager
+    def apply(self, executor, need_restore=True):
+        """Apply average values to parameters of current model.
+        Used as a context manager: the apply program is run on entry and,
+        if need_restore is True, the restore program is run on exit, even
+        when the body raises.
+        Args:
+            executor: the executor used to run the apply/restore programs.
+            need_restore(bool): whether to restore the parameters on exit.
+        """
+        executor.run(self.apply_program)
+        try:
+            yield
+        finally:
+            if need_restore:
+                self.restore(executor)
+    def restore(self, executor):
+        """Restore parameter values of current model.
+        Args:
+            executor: the executor used to run the restore program.
+        """
+        executor.run(self.restore_program)
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -14,6 +14,7 @@
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 from scipy.special import expit
@@ -212,18 +213,39 @@ class TestRound(OpTest):
 class TestRelu(OpTest):
    def setUp(self):
        self.op_type = "relu"
-        x = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
        # The same reason with TestAbs
        x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
+        out = np.maximum(x, 0)
-        self.outputs = {'Out': np.maximum(self.inputs['X'], 0)}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
    def test_check_output(self):
        self.check_output()
    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+    def init_dtype(self):
+        pass
+class TestFP16Relu(TestRelu):
+    # Runs the TestRelu case with float16 input data.
+    def init_dtype(self):
+        self.dtype = np.float16
+    def test_check_output(self):
+        # The output is only checked on a CUDA build whose device 0 reports
+        # float16 support; otherwise this test does nothing.
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 class TestBRelu(OpTest):
    def setUp(self):

--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -29,15 +29,20 @@ class TestSoftmaxOp(OpTest):
    def setUp(self):
        self.op_type = "softmax"
        self.use_cudnn = False
-        self.inputs = {
+        self.use_mkldnn = False
-            'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
+        self.dtype = np.float32
-        }
+        self.init_kernel_type()
-        self.outputs = {
-            'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+        x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
+        out = np.apply_along_axis(stable_softmax, 1, x)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn
        }
-        self.attrs = {'use_cudnn': self.use_cudnn, }
-    def init_op_type(self):
+    def init_kernel_type(self):
        pass
    def test_check_output(self):
@@ -48,6 +53,8 @@ class TestSoftmaxOp(OpTest):
            self.check_output()
    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
        if self.use_cudnn:
            place = core.CUDAPlace(0)
            self.check_grad_with_place(
@@ -57,8 +64,25 @@ class TestSoftmaxOp(OpTest):
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
-    def init_op_type(self):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
+    # Runs the softmax test with use_cudnn enabled and float16 input data.
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+    def test_check_output(self):
+        # The output is only checked on a CUDA build whose device 0 reports
+        # float16 support; otherwise this test does nothing.
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
+    # Runs the softmax test with the 'use_mkldnn' attribute set to True.
+    def init_kernel_type(self):
+        self.use_mkldnn = True
 if __name__ == "__main__":