diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b6f767869aaa800c122c8e7a06a1413e48e10e0
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -0,0 +1,145 @@
+# Fluid 分布式版本使用指南
+本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行，以及将单机训练脚本改造成支持集群训练的版本
+
+## 准备工作
+* 可用的集群
+
+    包含一个或多个计算节点的集群，每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址，集群内的所有计算节点可以通过网络相互通信。
+* 安装PaddlePaddle Fluid with Distribution版本
+
+    所有的计算节点上均需要安装分布式版本的PaddlePaddle，在使用GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
+
+    **注意：**当前对外提供的PaddlePaddle版本并不支持分布式，需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
+    cmake编译命令中需要将WITH_DISTRIBUTE设置为ON，下面是一个cmake编译指令示例：
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+## 更新训练脚本
+这里，我们以[Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例，描述如何将单机训练脚本改造成支持集群训练的版本。
+### 单机训练脚本示例
+```python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+我们创建了一个简单的全连接神经网络程序，并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
+### 介绍Parameter Server
+在非分布式版本的训练脚本中，只存在Trainer一种角色，它不仅处理常规的计算任务，也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中，由于存在多个Trainer节点进行同样的数据计算任务，因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中，我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
+
+**因此，在分布式的Fluid环境中，我们有两个角色需要创建，分别是Parameter Server和Trainer。**
+
+### 分布式训练 
+Fluid专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数，将它们分隔为两部分，通过send/recv操作算子进行连接。优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
+```python
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) 
+```
+将Distributed Transpiler、优化算子和梯度参数放在同一段代码中，如下所示：
+```python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+```
+### 分布式训练脚本运行说明
+分布式任务的运行需要将表格中说明的多个参数进行赋值:
+
+| 参数名 | 值类型 | 说明 | 示例 |
+|:-------------|:------|:---------------------------------------|:-------------|
+| trainer_id | int | 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 | 0/1/2/3 |
+| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
+| trainers | int | 训练节点的总个数，>0的数字 | 4 |
+| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
+| training_role | str | 节点角色， TRAINER/PSERVER | PSERVER |
+
+**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下： 
+
+```python
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops,
+    params_grads,
+    trainer_id,
+    pservers=pserver,
+    trainers=trainers)
+if training_role == "PSERVER":
+    pserver_prog = t.get_pserver_program(server_endpoint)
+    pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
+```
+
+### Demo
+完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
+
+第一步，进入demo代码所在目录：
+```bash
+cd /paddle/python/paddle/fluid/tests/book
+```
+
+第二步，启动Parameter Server：
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+```
+执行命令后请等待出现提示： ```Server listening on 192.168.1.2:6174 ```, 表示Parameter Server已经正常启动。
+
+第三步，启动Trainer：
+```bash
+PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+```
+由于我们定义的Trainer的数量是2个，因此需要在另外一个计算节点上再启动一个Trainer。
+
+现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
diff --git a/doc/v2/faq/index_en.rst b/doc/v2/faq/index_en.rst
index 57df868f760038b25fae30df7ab20a68875ad36a..3fa220792b252617848a1c76bc2be49928e35f64 100644
--- a/doc/v2/faq/index_en.rst
+++ b/doc/v2/faq/index_en.rst
@@ -1,7 +1,8 @@
 FAQ
 ====
 
- 
+This document provides answers to some of the frequently asked questions about PaddlePaddle. If your question is not covered here, please go to the `PaddlePaddle Community <https://github.com/PaddlePaddle/Paddle/issues>`_ to find an answer or submit a new `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_, and we will reply promptly.
+
 ..  toctree::
   :maxdepth: 1
 
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index d74c47b981e51f12d99098818c71f3f6ec455d98..ec637658c03ad94624ee9a4f5def6a84387d293e 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -613,3 +613,14 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
                                 ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
+// relu is registered explicitly: it is removed from FOR_EACH_KERNEL_FUNCTOR.
+REGISTER_OP_CPU_KERNEL(relu,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ReluFunctor<float>>,
+                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
+                                             ops::ReluFunctor<double>>);
+REGISTER_OP_CPU_KERNEL(
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                         ops::ReluGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
+                              ops::ReluGradFunctor<double>>);
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index b2633d017623c3a6a3bab2b416009d6d7c8fc1d4..7709a551dc155e1f3cd2a19a689999608f497beb 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
 
@@ -31,3 +32,16 @@ namespace ops = paddle::operators;
                                 ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
+// relu is registered explicitly; only the forward kernel also takes float16.
+REGISTER_OP_CUDA_KERNEL(
+    relu, ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                                ops::ReluFunctor<float>>,
+    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                          ops::ReluFunctor<double>>,
+    ops::ActivationKernel<paddle::platform::CUDADeviceContext,
+                          ops::ReluFunctor<paddle::platform::float16>>);
+REGISTER_OP_CUDA_KERNEL(
+    relu_grad, ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                         ops::ReluGradFunctor<float>>,
+    ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
+                              ops::ReluGradFunctor<double>>);
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 8f791a6ca81c13a92fd8adf0d1620203bd4cf7d6..b95e793586219b7c413d0c7adb835081874d9363 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -772,7 +772,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
   __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
   __macro(exp, ExpFunctor, ExpGradFunctor);                          \
-  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
   __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
   __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c95077fcbdb6b6c0da31f30b795dbe4d7d4fe6fe
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -0,0 +1,216 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/average_accumulates_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <>
+void GetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t& num_updates_,
+    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+  // CPU specialization: element 0 of each counter input tensor is read
+  // directly into the matching output reference.
+  num_updates_ = ctx.Input<Tensor>("in_num_updates")->data<int64_t>()[0];
+  num_accumulates_ =
+      ctx.Input<Tensor>("in_num_accumulates")->data<int64_t>()[0];
+  old_num_accumulates_ =
+      ctx.Input<Tensor>("in_old_num_accumulates")->data<int64_t>()[0];
+}
+
+template <>
+void SetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  // CPU specialization: each counter is written straight into element 0 of
+  // the matching output tensor.
+  ctx.Output<Tensor>("out_num_updates")->data<int64_t>()[0] = num_updates_;
+  ctx.Output<Tensor>("out_num_accumulates")->data<int64_t>()[0] =
+      num_accumulates_;
+  ctx.Output<Tensor>("out_old_num_accumulates")->data<int64_t>()[0] =
+      old_num_accumulates_;
+}
+// Operator class: validates inputs/outputs and infers the output shapes.
+class AverageAccumulatesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks all inputs/outputs exist; sums take param's dims, counters are {1}.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("param"),
+        "Input (param) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_1"),
+        "Input (sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_2"),
+        "Input (sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_3"),
+        "Input (sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_accumulates"),
+        "Input (in_num_accumulates) of average_accumulates op should "
+        "not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
+                   "Input (old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_updates"),
+        "Input (num_updates) of average_accumulates op should not be null.");
+    // Every in_* accumulator has a matching out_* output that must exist too.
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_1"),
+        "Output (sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_2"),
+        "Output (sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_3"),
+        "Output (sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
+                   "Output (num_accumulates) of average_accumulates op should "
+                   "not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
+                   "Output (old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_num_updates"),
+        "Output (num_updates) of average_accumulates op should not be null.");
+    // Shape inference: sum outputs copy param's dims; counters get dims {1}.
+    auto in_dim = ctx->GetInputDim("param");
+
+    ctx->SetOutputDim("out_sum_1", in_dim);
+    ctx->SetOutputDim("out_sum_2", in_dim);
+    ctx->SetOutputDim("out_sum_3", in_dim);
+    ctx->SetOutputDim("out_num_accumulates", {1});
+    ctx->SetOutputDim("out_old_num_accumulates", {1});
+    ctx->SetOutputDim("out_num_updates", {1});
+  }
+  // GetExpectedKernelType: dtype of input "param", place of the context.
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("param")->type()),
+        ctx.GetPlace());
+  }
+};
+// Declares the op's inputs, outputs, attributes and user-facing documentation.
+class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("param", "(Tensor), The parameter to be accumulated.");
+    AddInput("in_sum_1",
+             "(Tensor), A tensor used to store the parameter "
+             "sums with the same shape as input(param).");
+    AddInput("in_sum_2",
+             "(Tensor), An auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param). It is used to avoid loss of precision due to too "
+             "many sums.");
+    AddInput("in_sum_3",
+             "(Tensor), An auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param).");
+    AddInput("in_num_accumulates",
+             "(Tensor<int64_t>), The accumulating times of current window with "
+             "shape [1].");
+    AddInput(
+        "in_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddInput("in_num_updates",
+             "(Tensor<int64_t>), The total number of batches used by training "
+             "before this batch with shape [1].");
+
+    AddOutput("out_sum_1",
+              "(Tensor), A tensor used to store the "
+              "parameter sums with the same shape as input(param).");
+    AddOutput("out_sum_2",
+              "(Tensor), An auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param). It is used to avoid loss of precision due to too "
+              "many sums.");
+    AddOutput("out_sum_3",
+              "(Tensor), An auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param).");
+    AddOutput(
+        "out_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of current window with "
+        "shape [1].");
+    AddOutput(
+        "out_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddOutput(
+        "out_num_updates",
+        "(Tensor<int64_t>), The total number of batches used by training "
+        "before this batch with shape [1].");
+
+    AddAttr<float>("average_window",
+                   "(float, default 0) "
+                   "The rate of average window size relative to num_updates.")
+        .SetDefault(0);
+    AddAttr<int64_t>("max_average_window",
+                     "(int64_t) "
+                     "Maximum size of average window. It suggests that the "
+                     "number of mini-batches "
+                     "in one pass is appropriate value to set.");
+    AddAttr<int64_t>("min_average_window",
+                     "(int64_t, default 10000L) "
+                     "Minimum size of average window.")
+        .SetDefault(10000L);
+
+    AddComment(R"DOC(
+AverageAccumulates Operator.
+Accumulate the sum of parameter within sliding window. The size of sliding window is
+determined by 'average_window', 'max_average_window' and 'min_average_window'.
+Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
+'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
+
+All the accumulators were inited to zero before training.
+
+And for a mini-batch in training, accumulators were computed as below steps:
+    num_updates += 1
+    num_accumulates += 1
+    sum_1 += param
+    if num_updates % kMaxNumAccumulates == 0:
+        sum_2 += sum_1
+        sum_1 = 0
+    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
+        sum_3 = sum_1 + sum_2
+        sum_1 = 0
+        sum_2 = 0
+        old_num_accumulates = num_accumulates
+        num_accumulates = 0
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Registers the op with EmptyGradOpMaker and its float/double CPU kernels.
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
+                  ops::AverageAccumulatesOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..270c46984465e5ca62eaa8da3955ce7a3eaa0c57
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.cu
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+template <>
+void GetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t& num_updates_,
+    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
+  // Read from the GPU this kernel runs on, not a default-constructed place.
+  auto gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  auto cpu = platform::CPUPlace();
+  auto stream = ctx.cuda_device_context().stream();
+  auto* old_num = ctx.Input<Tensor>("in_old_num_accumulates")->data<int64_t>();
+  auto* num_acc = ctx.Input<Tensor>("in_num_accumulates")->data<int64_t>();
+  auto* num_upd = ctx.Input<Tensor>("in_num_updates")->data<int64_t>();
+  memory::Copy(cpu, &old_num_accumulates_, gpu, old_num, sizeof(int64_t),
+               stream);
+  memory::Copy(cpu, &num_accumulates_, gpu, num_acc, sizeof(int64_t), stream);
+  memory::Copy(cpu, &num_updates_, gpu, num_upd, sizeof(int64_t), stream);
+}
+
+template <>
+void SetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  // Target the GPU this kernel runs on, not a default-constructed CUDAPlace.
+  auto gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  auto cpu = platform::CPUPlace();
+  auto stream = ctx.cuda_device_context().stream();
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
+  memory::Copy(gpu, out_old_num_accumulates->data<int64_t>(), cpu,
+               &old_num_accumulates_, sizeof(int64_t), stream);
+  memory::Copy(gpu, out_num_accumulates->data<int64_t>(), cpu,
+               &num_accumulates_, sizeof(int64_t), stream);
+  memory::Copy(gpu, out_num_updates->data<int64_t>(), cpu, &num_updates_,
+               sizeof(int64_t), stream);
+}
+
+}  // namespace operators
+}  // namespace paddle
+// Registers the float/double CUDA kernels of the average_accumulates op.
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f858109d1428dc67d94c253e5a39818eb2d4560d
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+// Loads the op's three int64 counters from its inputs; specialized per device.
+template <typename DeviceContext>
+void GetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t& num_updates, int64_t& num_accumulates,
+                     int64_t& old_num_accumulates);
+// Stores the three counters into the op's outputs; specialized per device.
+template <typename DeviceContext>
+void SetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t num_updates, int64_t num_accumulates,
+                     int64_t old_num_accumulates);
+// Kernel that advances the sliding-window parameter sums by one mini-batch.
+template <typename DeviceContext, typename T>
+class AverageAccumulatesKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Fold sum_1 into sum_2 every this many updates to limit precision loss.
+    static const int64_t kMaxNumAccumulates = 16384;
+    // Load the three counters from the inputs (device-specific helper).
+    int64_t num_updates = 0;
+    int64_t num_accumulates = 0;
+    int64_t old_num_accumulates = 0;
+    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+
+    // Read attrs; min_average_window is clamped to max_average_window.
+    float average_window = ctx.Attr<float>("average_window");
+    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
+    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
+    min_average_window =
+        std::min<int64_t>(min_average_window, max_average_window);
+
+    // Wrap param and the three input sums as flattened Eigen vectors.
+    auto* param = ctx.Input<Tensor>("param");
+    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
+    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
+    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
+    auto param_tensor = EigenVector<T>::Flatten(*param);
+    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
+    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
+    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
+
+    // Same flattened views for the three output sums.
+    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
+    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
+    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
+    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+    // Compute. NOTE(review): in_sum_* are read after out_sum_* are written;
+    // the op doc says in/out share memory, so confirm this ordering is safe.
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    math::SetConstant<DeviceContext, T> constant_functor;
+    ++num_updates;
+    ++num_accumulates;
+    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+    out_sum_2_tensor.device(place) = in_sum_2_tensor;
+    out_sum_3_tensor.device(place) = in_sum_3_tensor;
+    if (num_updates % kMaxNumAccumulates == 0) {
+      // Move the sum to a different buffer to avoid loss of precision due to
+      // too many sums.
+      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+    }
+    if (num_accumulates >= min_average_window &&
+        num_accumulates >= std::min<int64_t>(max_average_window,
+                                             num_updates * average_window)) {
+      // Window full: store sum_1 + sum_2 in sum_3, reset sums and the counter.
+      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
+                       0.0);
+      old_num_accumulates = num_accumulates;
+      num_accumulates = 0;
+    }
+
+    // Write the updated counters back to the outputs.
+    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index ec315695a68befc2e3de798fdb3fa146a903aaff..6da3a24dc89a85fe432b6350d3af7b0e84337c9d 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -78,7 +78,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
       for (int64_t i = 0; i < batch_size; ++i) {
         PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
         int64_t index = i * class_num + label_data[i];
-        dx_data[index] = -dy_data[i] / x_data[index];
+        dx_data[index] = math::TolerableValue<T>()(-dy_data[i] / x_data[index]);
       }
     }
   }
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index 34ea6a91ce7743462d378cf471a5ec3a12ca51d1..5518ebed3f792a5acdfbb27976bc2c6dbd78069a 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -89,6 +89,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
       XGrad->mutable_data<T>(context.GetPlace())));
 }
 
+template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<float>;
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index 47cb336d87f8627d86ac33d6ac32c04d5d93f753..5596fa0648ccc151bc0d11de9c556599428a8d71 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -56,7 +56,9 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::SoftmaxCUDNNKernel<float>);
-REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
+                   ops::SoftmaxCUDNNKernel<float>,
+                   ops::SoftmaxCUDNNKernel<plat::float16>);
+REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
                    ops::SoftmaxGradCUDNNKernel<float>);
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf0244e8662e827a90d8472a097315680579ff6d
--- /dev/null
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+#include <iostream>
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::MKLDNNMemDesc;
+
+using mkldnn::memory;  // Note: paddle has also "memory" namespace
+using mkldnn::primitive;
+using mkldnn::softmax_forward;
+using mkldnn::prop_kind;
+using mkldnn::stream;
+
+template <typename T>
+class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+    const Tensor* input = ctx.Input<Tensor>("X");
+    Tensor* output = ctx.Output<Tensor>("Out");
+    PADDLE_ENFORCE(input->dims().size() == 2UL,
+                   "The input of softmax op must be a 2D matrix.");
+    const T* input_data = input->data<T>();
+    // Allocate the output buffer on the place this kernel runs on.
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    // MKL-DNN supports softmax over a selected axis. For a 2D tensor we
+    // normalize over the last axis, i.e. axis 1.
+    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
+                   "Softmax input and output dimensions should match");
+    // The same memory descriptor is used for both input and output.
+    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
+    // Only the NC data format (with f32 elements) is supported for now.
+    // TODO(jczaja-intel): support more formats
+    auto softmax_md =
+        MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc);
+    // Normalization is performed over the innermost dimension, i.e. C of NC.
+    auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                              softmax_md, 1 /*dim: C*/);
+    // Wrap the raw input/output buffers as MKL-DNN memory primitives.
+    auto softmax_src_memory =
+        memory({softmax_md, mkldnn_engine}, (void*)input_data);
+    auto softmax_dst_memory =
+        memory({softmax_md, mkldnn_engine}, (void*)output_data);
+    auto softmax_prim_desc =
+        softmax_forward::primitive_desc(softmax_desc, mkldnn_engine);
+    auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory,
+                                   softmax_dst_memory);
+    std::vector<primitive> pipeline{softmax};
+    stream(stream::kind::eager).submit(pipeline).wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::SoftmaxMKLDNNKernel<float>);
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 1b63f8a499e5d20d2f10c3cd1024d1bcf78764d4..e2c0f915d96b7746191572fa27b725d90cb6e2e5 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -13,7 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/softmax_op.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
 
@@ -38,26 +44,32 @@ class SoftmaxOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-    bool runtime_cudnn_support = false;
+    framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kCUDNN;
     }
 #endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
-      library_ = framework::LibraryType::kCUDNN;
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
     }
+#endif
+
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+                        "float16 can only be used when CUDNN is used");
+    }
+
     std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        framework::StringToDataLayout(data_format), library_);
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                   framework::StringToDataLayout(data_format),
+                                   library_);
   }
 };
-
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -77,6 +89,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
         "Defaults to \"NHWC\". Specify the data format of the output data, "
         "the input will be transformed automatically. ")
         .SetDefault("AnyLayout");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Softmax Operator.
 
@@ -119,19 +134,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
-    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-    bool runtime_cudnn_support = false;
+    framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      auto& dev_ctx =
-          ctx.template device_context<platform::CUDADeviceContext>();
-      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
-    }
-#endif
-    framework::LibraryType library_ = framework::LibraryType::kPlain;
-    if (use_cudnn && runtime_cudnn_support) {
+    if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
     }
+#endif
     std::string data_format = ctx.Attr<std::string>("data_format");
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 98b4178177b0a8bafd6fe34a92be2a07a2fbc5a7..59b76a1edb5ec5900520fbccb6a6f8f6e7a70aa4 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -10,43 +10,45 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device_context.h"
+#include <unordered_set>
 #include "paddle/fluid/memory/memory.h"
-
 namespace paddle {
 namespace platform {
 
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 
-const platform::DeviceContext* DeviceContextPool::Get(
-    const platform::Place& place) {
+platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
     PADDLE_THROW(
         "'Place' is not supported, Please re-compile with WITH_GPU "
         "option");
   }
-  return it->second;
+  return it->second.get();
 }
 
 DeviceContextPool::DeviceContextPool(
     const std::vector<platform::Place>& places) {
   PADDLE_ENFORCE_GT(places.size(), 0);
-  for (size_t i = 0; i < places.size(); i++) {
-    if (platform::is_cpu_place(places[i])) {
+  using PtrType = std::unique_ptr<DeviceContext>;
+  std::unordered_set<Place, PlaceHash> set;
+  for (auto& p : places) {
+    set.insert(p);
+  }
+
+  for (auto& p : set) {
+    if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
-      device_contexts_.emplace(places[i],
-                               new platform::MKLDNNDeviceContext(
-                                   boost::get<platform::CPUPlace>(places[i])));
+      device_contexts_.emplace(
+          p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
 #else
-      device_contexts_.emplace(places[i],
-                               new platform::CPUDeviceContext(
-                                   boost::get<platform::CPUPlace>(places[i])));
+      device_contexts_.emplace(
+          p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
 #endif
-    } else if (platform::is_gpu_place(places[i])) {
+    } else if (platform::is_gpu_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(places[i],
-                               new platform::CUDADeviceContext(
-                                   boost::get<platform::CUDAPlace>(places[i])));
+      device_contexts_.emplace(
+          p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
 #else
       PADDLE_THROW(
           "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -159,6 +161,7 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
+  std::lock_guard<std::mutex> guard(mutex_);
   PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
   PADDLE_ENFORCE(cudaGetLastError());
 }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 603b890af13b529c490c29112a73a09cc815d07a..202394c7be7e103a609dd0999fc883c794ef0edd 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -103,6 +103,7 @@ class CUDADeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
 
+  mutable std::mutex mutex_;
   cudaStream_t stream_;
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
@@ -159,7 +160,7 @@ class DeviceContextPool {
   }
 
   /*! \brief  Return handle of single device context. */
-  const platform::DeviceContext* Get(const platform::Place& place);
+  platform::DeviceContext* Get(const platform::Place& place);
 
   template <typename Place>
   const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
@@ -172,19 +173,8 @@ class DeviceContextPool {
 
  private:
   static DeviceContextPool* pool;
-  constexpr static int LEFT_SHIFT = 8;
-  struct Hash {
-    std::hash<int> hash_;
-    size_t operator()(const platform::Place& place) const {
-      int pre_hash = place.which() << LEFT_SHIFT;
-      if (platform::is_gpu_place(place)) {
-        pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
-      }
-      return hash_(pre_hash);
-    }
-  };
-  std::unordered_map<const platform::Place, const platform::DeviceContext*,
-                     Hash>
+  std::unordered_map<const platform::Place,
+                     std::unique_ptr<platform::DeviceContext>, PlaceHash>
       device_contexts_;
   DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
 };
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index 501bddfc6ec8b5d0bf554b0911c32e47fd51ec15..4cc8b377b8b671eb5a446ecbae21ba9628fbd2c8 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -65,6 +65,18 @@ bool is_cpu_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 bool is_same_place(const Place &, const Place &);
 
+struct PlaceHash {
+  std::size_t operator()(const Place &p) const {
+    constexpr size_t num_dev_bits = 4;
+    std::hash<int> ihash;
+    size_t dev_id = 0;
+    if (is_gpu_place(p)) {
+      dev_id = boost::get<CUDAPlace>(p).device;
+    }
+    return ihash(dev_id << num_dev_bits | p.which());
+  }
+};
+
 std::ostream &operator<<(std::ostream &, const Place &);
 
 template <typename Visitor>
diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py
index 0fc4981a8e9da09f15e6d0a5e5c6761e01328876..3e4292d23550b853ea73de787a1c053e1f2c80fd 100644
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -131,7 +131,7 @@ def make_channel(dtype, capacity=0):
     return channel
 
 
-def channel_send(channel, value, copy=False):
+def channel_send(channel, value, is_copy=False):
     """
     Sends a value through a channel variable. Used by an unbuffered or buffered
     channel to pass data from within or to a concurrent Go block, where
@@ -141,8 +141,8 @@ def channel_send(channel, value, copy=False):
         channel (Variable|Channel): Channel variable created using
         `make_channel`.
         value (Variable): Value to send to channel
-        copy (bool): Copy data while channel send. If False, then data
-        is moved. The input cannot be used after move.
+        is_copy (bool): Copy data while channel send. If False, then data
+        is moved. The input cannot be used after move. (default False)
     Returns:
         Variable: The boolean status on whether or not the channel
                   successfully sent the passed value.
@@ -166,7 +166,7 @@ def channel_send(channel, value, copy=False):
 
     X = value
 
-    if copy is True:
+    if is_copy is True:
         copied_X = helper.create_variable(
             name=unique_name.generate(value.name + '_copy'),
             type=value.type,
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 70ecffd910a46570b5a8e576d88039fa5e22e726..3e78788f470556d2196b5104f69a0a3285543ec4 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -918,6 +918,24 @@ class Block(object):
                 name=v.name)
             self.vars[new_p.name] = new_p
 
+    def clone_variable(self, var):
+        """
+        Clone a variable into the current block.
+        Args:
+            var(Variable): the variable to be cloned.
+        Returns:
+            The variable created in this block with the name, shape, dtype,
+            type and lod_level of 'var'; it is always marked persistable.
+        """
+        assert isinstance(var, Variable)
+        return self.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=True)
+
 
 class Program(object):
     def __init__(self):
@@ -960,14 +978,14 @@ class Program(object):
         """Clone the Program object
 
         Set for_test to False when we want to clone the program for training.
-        Set for_test to True when we want to clone the program for testing.         
+        Set for_test to True when we want to clone the program for testing.
 
         Args:
             for_test(bool): Some operators, such as batch_norm and drop_out ops,
                 behave differently in training and testing. If for_test is True,
                 the is_test attributes in these operators will be set to True for
-                testing purposes, otherwise, they remain unchanged.  
-                
+                testing purposes, otherwise, they remain unchanged.
+
         Returns(Program):
             The cloned Program object.
         """
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index da7e74c901e1f5be709c5f9d73f048bfda0c5549..58b668227168c5c5e080f3928035ad98303bbae9 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -399,6 +399,9 @@ class LayerHelper(object):
         if isinstance(act, basestring):
             act = {'type': act}
         tmp = self.create_tmp_variable(dtype=input_var.dtype)
+
+        if 'use_mkldnn' in self.kwargs:
+            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
         act_type = act.pop('type')
         self.append_op(
             type=act_type,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 2ce68f95057f7820d7ab59ba2b41171c7ecd3654..679de6ce2aa67abe1322702fcb371eded0130698 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -82,6 +82,7 @@ def fc(input,
        num_flatten_dims=1,
        param_attr=None,
        bias_attr=None,
+       use_mkldnn=False,
        act=None,
        name=None):
     """
@@ -163,8 +164,11 @@ def fc(input,
             inputs={"X": input_var,
                     "Y": w},
             outputs={"Out": tmp},
-            attrs={"x_num_col_dims": num_flatten_dims,
-                   "y_num_col_dims": 1})
+            attrs={
+                "x_num_col_dims": num_flatten_dims,
+                "y_num_col_dims": 1,
+                'use_mkldnn': use_mkldnn
+            })
         mul_results.append(tmp)
 
     # sum
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index d7bad221c5fa7b18137bf317125195267437a644..f5c6b47d243dcf4ba985cfb41fc23b44d3ed809f 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -69,6 +69,7 @@ __all__ = [
     'gaussian_random_batch_size_like',
     'cumsum',
     'scatter',
+    'sum',
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index a33760a528f667b7afabafa19762eca7d1ef0635..180575c35dc6e115e11cccf9fff9fb2d3cd7e9a6 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from collections import defaultdict
-
+from paddle.fluid.framework import Program
 import framework
 import layers
 from backward import append_backward
@@ -23,9 +23,11 @@ from initializer import Constant
 from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
 from clip import append_gradient_clip_ops, error_clip_callback
+from contextlib import contextmanager
 
 __all__ = [
-    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Adadelta'
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
+    'Adadelta', 'ModelAverage'
 ]
 
 
@@ -121,7 +123,12 @@ class Optimizer(object):
         """
         pass
 
-    def _add_accumulator(self, name, param, dtype=None, fill_value=0.0):
+    def _add_accumulator(self,
+                         name,
+                         param,
+                         dtype=None,
+                         fill_value=0.0,
+                         shape=None):
         """Utility function to add an accumulator for a parameter
 
         Args:
@@ -135,17 +142,19 @@ class Optimizer(object):
                 param.name in self._accumulators[name]):
             raise Exception("Accumulator {} already exists for parameter {}".
                             format(name, param.name))
-
+        if shape is None:
+            shape = param.shape
         assert isinstance(self.helper, LayerHelper)
         var = self.helper.create_global_variable(
             name=unique_name.generate(name),
             persistable=True,
             dtype=dtype or param.dtype,
             type=param.type,
-            shape=param.shape)
+            shape=shape)
         self.helper.set_variable_initializer(
             var, initializer=Constant(value=float(fill_value)))
         self._accumulators[name][param.name] = var
+        return var
 
     def _get_accumulator(self, name, param):
         """Utility function to fetch an accumulator for a parameter
@@ -797,3 +806,143 @@ Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
 Adadelta = AdadeltaOptimizer
 RMSProp = RMSPropOptimizer
+
+
+class ModelAverage(Optimizer):
+    """Accumulate the average of parameters within sliding window. The average
+    result will be saved in temporary variables which can be applied to
+    parameter variables of current model by calling 'apply()' method. And the
+    'restore()' method is used to restore the parameter values of current model.
+
+    The size of average window is determined by average_window_rate,
+    min_average_window, max_average_window and current update times.
+
+    Args:
+        params_grads: A list of parameter-grad variable pairs.
+        average_window_rate: The rate of average window.
+        min_average_window: The minimum size of average window.
+        max_average_window: The maximum size of average window.
+
+    Examples:
+        ...
+        optimizer = fluid.optimizer.Momentum()
+        _, params_grads = optimizer.minimize(cost)
+        model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
+                                                min_average_window=10000,
+                                                max_average_window=20000)
+        for pass_id in range(args.pass_num):
+            for data in train_reader():
+                exe.run(fluid.default_main_program()...)
+
+            with model_average.apply(exe):
+                for data in test_reader():
+                    exe.run(inference_program...)
+    """
+
+    def __init__(self,
+                 params_grads,
+                 average_window_rate,
+                 min_average_window=10000,
+                 max_average_window=10000,
+                 **kwargs):
+        super(ModelAverage, self).__init__(0.0, **kwargs)
+        self.average_window = average_window_rate
+        self.min_average_window = min_average_window
+        self.max_average_window = max_average_window
+        self.params_grads = params_grads
+        for param, grad in self.params_grads:
+            if grad is not None:
+                self._append_average_accumulate_op(param)
+
+        self.apply_program = Program()
+        block = self.apply_program.global_block()
+        with program_guard(main_program=self.apply_program):
+            for param_grad in self.params_grads:
+                if param_grad[1] is not None:
+                    self._add_average_apply_op(block, param_grad)
+
+        self.restore_program = Program()
+        block = self.restore_program.global_block()
+        with program_guard(main_program=self.restore_program):
+            for param_grad in self.params_grads:
+                if param_grad[1] is not None:
+                    self._add_average_restore_op(block, param_grad)
+
+    def _add_average_apply_op(self, block, param_grad):
+        """Back up param into grad, then overwrite param with its average."""
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
+        sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
+        sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
+        num_accumulates = block.clone_variable(
+            self._get_accumulator('num_accumulates', param))
+        old_num_accumulates = block.clone_variable(
+            self._get_accumulator('old_num_accumulates', param))
+        # num_updates is not read below; it is only cloned into the block.
+        block.clone_variable(self._get_accumulator('num_updates', param))
+        layers.assign(input=param, output=grad)
+        # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
+        count = layers.sum(x=[num_accumulates, old_num_accumulates])
+        total = layers.sum(x=[sum_1, sum_2, sum_3])
+        count = layers.cast(x=count, dtype='float32')
+        total = layers.cast(x=total, dtype='float32')
+        layers.elementwise_div(x=total, y=count, out=param)
+
+    def _add_average_restore_op(self, block, param_grad):
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        layers.assign(input=grad, output=param)
+
+    def _append_average_accumulate_op(self, param):
+        self.helper = LayerHelper("average_accumulate")
+        sum_1 = self._add_accumulator('sum_1', param)
+        sum_2 = self._add_accumulator('sum_2', param)
+        sum_3 = self._add_accumulator('sum_3', param)
+        num_accumulates = self._add_accumulator(
+            'num_accumulates', param, dtype='int64', shape=[1])
+        old_num_accumulates = self._add_accumulator(
+            'old_num_accumulates', param, dtype='int64', shape=[1])
+        num_updates = self._add_accumulator(
+            'num_updates', param, dtype='int64', shape=[1])
+
+        self.helper.append_op(
+            type='average_accumulates',
+            inputs={
+                "param": param,
+                "in_sum_1": sum_1,
+                "in_sum_2": sum_2,
+                "in_sum_3": sum_3,
+                "in_num_accumulates": num_accumulates,
+                "in_old_num_accumulates": old_num_accumulates,
+                "in_num_updates": num_updates
+            },
+            outputs={
+                "out_sum_1": sum_1,
+                "out_sum_2": sum_2,
+                "out_sum_3": sum_3,
+                "out_num_accumulates": num_accumulates,
+                "out_old_num_accumulates": old_num_accumulates,
+                "out_num_updates": num_updates,
+            },
+            attrs={
+                "average_window": self.average_window,
+                "min_average_window": self.min_average_window,
+                "max_average_window": self.max_average_window,
+            })
+
+    @contextmanager
+    def apply(self, executor, need_restore=True):
+        """Context manager: run apply_program on entry to apply the averaged
+        values; on exit call restore(executor) if need_restore is true."""
+        executor.run(self.apply_program)
+        try:
+            yield
+        finally:
+            if need_restore:
+                self.restore(executor)
+
+    def restore(self, executor):
+        """Restore parameter values of current model.
+        """
+        executor.run(self.restore_program)
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index eab41ebe711bd21bdc3b34ca83ab57388cc35ba2..1e3decfbaf0691e912b96b415b68353e626cf51e 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 from scipy.special import expit
 
@@ -212,18 +213,39 @@ class TestRound(OpTest):
 class TestRelu(OpTest):
     def setUp(self):
         self.op_type = "relu"
-        x = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         # The same reason with TestAbs
         x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.maximum(self.inputs['X'], 0)}
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Relu(TestRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestBRelu(OpTest):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 4f20da2b926823db9e7ec92c95178b6d3d1feec9..33d60c7e31ce0817ad26ea1c1c974339936052d3 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -29,15 +29,20 @@ class TestSoftmaxOp(OpTest):
     def setUp(self):
         self.op_type = "softmax"
         self.use_cudnn = False
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
-        }
-        self.outputs = {
-            'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+        self.use_mkldnn = False
+        self.dtype = np.float32
+        self.init_kernel_type()
+
+        x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
+        out = np.apply_along_axis(stable_softmax, 1, x)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn
         }
-        self.attrs = {'use_cudnn': self.use_cudnn, }
 
-    def init_op_type(self):
+    def init_kernel_type(self):
         pass
 
     def test_check_output(self):
@@ -48,6 +53,8 @@ class TestSoftmaxOp(OpTest):
             self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         if self.use_cudnn:
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
@@ -57,8 +64,25 @@ class TestSoftmaxOp(OpTest):
 
 
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
-    def init_op_type(self):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+
+
+class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
+
+class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
 
 
 if __name__ == "__main__":