Merge remote-tracking branch 'upstream/develop' into windows/build

dfbac603 · peizhilin · 7c8c9dc9 · dd6fd4c7 · dfbac603 · dfbac603
16 changed file
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -25,6 +25,7 @@
 | kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
+| cjld | Dun Liang |
 | lipeng-unisound | Peng Li |
 | liuyuan | Yuan Liu |
 | livc | Zhao Li |

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -103,6 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
+paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None))
 paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/group_norm_op.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+// Forward operator definition for group normalization.
+//
+// InferShape checks that X, Y, Mean and Variance are wired up, validates the
+// 'groups' attribute against the channel dimension x_dim[1], requires the
+// optional Scale/Bias inputs to be 1-D of length x_dim[1], and then sets
+//   Y        -> same dims (and LoD) as X
+//   Mean     -> {x_dim[0], groups}
+//   Variance -> {x_dim[0], groups}
+class GroupNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
+                   "Output(Mean) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
+                   "Output(Variance) of GroupNormOp should not be null.");
+    auto x_dim = ctx->GetInputDim("X");
+    // x_dim[0] and x_dim[1] are read right below, so X must have at least
+    // two dimensions; fail with a clear message instead of indexing past the
+    // end of the dims.
+    PADDLE_ENFORCE_GE(x_dim.size(), 2,
+                      "Input(X) of GroupNormOp should have at least 2 dims.");
+    auto channel_num = x_dim[1];
+    auto batch_size = x_dim[0];
+    auto groups = ctx->Attrs().Get<int>("groups");
+    PADDLE_ENFORCE_LE(
+        groups, channel_num,
+        "'groups' must be less equal than the number of channels.");
+    PADDLE_ENFORCE_GE(groups, 1, "'groups' must be greater equal than 1.");
+    // Scale and Bias are optional; when given they must be per-channel.
+    if (ctx->HasInput("Scale")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], channel_num);
+    }
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], channel_num);
+    }
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    // One statistic per (batch element, group).
+    ctx->SetOutputDim("Mean", {batch_size, groups});
+    ctx->SetOutputDim("Variance", {batch_size, groups});
+    ctx->ShareLoD("X", "Y");
+  }
+};
+// Declares the inputs, outputs and attributes of the group_norm operator.
+//
+// Inputs:  X (required), Scale and Bias (both dispensable).
+// Outputs: Y, plus Mean and Variance which are marked as intermediate.
+// Attrs:   epsilon (float, default 1e-5, must lie in [0, 1]) and
+//          groups (int, must be > 0).
+class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor.");
+    // The two literals below are concatenated by the compiler, so the first
+    // one needs its trailing space ("size C that", not "size Cthat").
+    AddInput("Scale",
+             "Scale is a 1-dimensional tensor of size C "
+             "that is applied to the output.")
+        .AsDispensable();
+    AddInput("Bias",
+             "Bias is a 1-dimensional tensor of size C "
+             "that is applied to the output")
+        .AsDispensable();
+    AddOutput("Y", "Result after normalization.");
+    AddOutput("Mean", "Mean of each group.").AsIntermediate();
+    AddOutput("Variance", "Variance of each group.").AsIntermediate();
+    AddAttr<float>("epsilon",
+                   "Constant for numerical stability [default 1e-5].")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 1.0f,
+                         "'epsilon' should be between 0.0 and 1.0.");
+        });
+    AddAttr<int>("groups", "The number of groups that divided from channels.")
+        .AddCustomChecker([](const int &groups) {
+          PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero.");
+        });
+    AddComment(R"DOC(
+Group Normalization
+Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
+)DOC");
+  }
+};
+// Backward operator definition for group normalization.
+class GroupNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires X, Mean, Variance and Y@GRAD. Every gradient output that is
+  // present (X@GRAD, Scale@GRAD, Bias@GRAD) gets the dims of the matching
+  // forward input.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // Mandatory inputs.
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                   "Input(Mean) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                   "Input(Variance) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) of GroupNormOp should not be null.");
+    // Optional gradient outputs: copy the dims over only when requested.
+    const auto copy_dims_to_grad = [ctx](const char *forward_name) {
+      const auto grad_name = framework::GradVarName(forward_name);
+      if (!ctx->HasOutput(grad_name)) return;
+      ctx->SetOutputDim(grad_name, ctx->GetInputDim(forward_name));
+    };
+    copy_dims_to_grad("X");
+    copy_dims_to_grad("Scale");
+    copy_dims_to_grad("Bias");
+  }
+ protected:
+  // The kernel data type is taken from Y@GRAD, which may be stored either as
+  // a Tensor or as a LoDTensor; anything else is rejected.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *grad_var = ctx.InputVar(framework::GradVarName("Y"));
+    if (grad_var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *grad_tensor = nullptr;
+    if (grad_var->IsType<Tensor>()) {
+      grad_tensor = &grad_var->Get<Tensor>();
+    } else if (grad_var->IsType<LoDTensor>()) {
+      grad_tensor = &grad_var->Get<LoDTensor>();
+    }
+    if (grad_tensor == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::OpKernelType(
+        framework::ToDataType(grad_tensor->type()), ctx.GetPlace());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);  // forward op
+REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);  // backward op
+REGISTER_OP_CPU_KERNEL(  // CPU forward kernels for float and double
+    group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(  // CPU backward kernels for float and double
+    group_norm_grad,
+    ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <cub/cub.cuh>
+#include "paddle/fluid/operators/group_norm_op.h"
+namespace paddle {
+namespace operators {
+// First forward pass: accumulates, per (batch, group), the mean of x into
+// `mean` and the mean of x^2 into `var` (not yet the variance).
+//
+// Block indexing used here: blockIdx.x = channel inside the group,
+// blockIdx.y = group, blockIdx.z = batch element. The threads of a block
+// stride over the `imsize` elements of that one channel. `mean` and `var`
+// are only added to (atomically), never overwritten by this kernel.
+template <typename T>
+__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C,
+                                              int imsize, int groups,
+                                              int group_size, T* mean, T* var) {
+  const int batch_idx = blockIdx.z;
+  const int group_idx = blockIdx.y;
+  const int channel_in_group = blockIdx.x;
+  const int channel = group_idx * group_size + channel_in_group;
+  // Blocks mapped past the last channel have nothing to do.
+  if (channel >= C) return;
+  // Number of channels this group really has; it is smaller than group_size
+  // when fewer than group_size channels remain.
+  const int channels_in_group =
+      min(group_size, static_cast<int>(C - group_idx * group_size));
+  const int base = (batch_idx * C + channel) * imsize;
+  T sum = 0, square_sum = 0;
+  for (int i = threadIdx.x; i < imsize; i += blockDim.x) {
+    const T val = x[base + i];
+    sum += val;
+    square_sum += val * val;
+  }
+  // Pre-divide each thread's partial sum by the group's element count so the
+  // accumulated totals are already means.
+  sum /= channels_in_group * imsize;
+  square_sum /= channels_in_group * imsize;
+  // Block-level reduction through shared memory, then one global atomic add
+  // per block and statistic.
+  __shared__ T s_mem[2];
+  if (threadIdx.x == 0) {
+    s_mem[1] = 0;
+    s_mem[0] = 0;
+  }
+  __syncthreads();
+  paddle::platform::CudaAtomicAdd(&s_mem[0], sum);
+  paddle::platform::CudaAtomicAdd(&s_mem[1], square_sum);
+  __syncthreads();
+  if (threadIdx.x != 0) return;
+  const int stat_idx = batch_idx * groups + group_idx;
+  paddle::platform::CudaAtomicAdd(&mean[stat_idx], s_mem[0]);
+  paddle::platform::CudaAtomicAdd(&var[stat_idx], s_mem[1]);
+}
+// Second forward pass: normalizes x with the per-(batch, group) statistics.
+//
+// `mean` holds E[x] and `var` holds E[x^2] for each (batch, group); the
+// variance E[x^2] - E[x]^2 is derived here and stored into `real_var` by one
+// thread per (batch, group). `scale` and `bias` may be null, in which case
+// the corresponding step is skipped.
+template <typename T>
+__global__ void GroupNormForward(const T* x, const T* mean, const T* var,
+                                 const T* scale, const T* bias, int N, int C,
+                                 int imsize, int groups, int group_size,
+                                 T epsilon, T* y, T* real_var) {
+  const int batch_idx = blockIdx.z;
+  const int group_idx = blockIdx.y;
+  const int channel_in_group = blockIdx.x;
+  const int channel = group_idx * group_size + channel_in_group;
+  if (channel >= C) return;
+  const int stat_idx = batch_idx * groups + group_idx;
+  const T group_mean = mean[stat_idx];
+  T group_var = var[stat_idx];
+  group_var = group_var - group_mean * group_mean;
+  const T var_inv = 1.0 / sqrt(group_var + epsilon);
+  // Only the first thread of the group's first channel block publishes the
+  // variance.
+  if (channel_in_group == 0 && threadIdx.x == 0) {
+    real_var[stat_idx] = group_var;
+  }
+  const int base = (batch_idx * C + channel) * imsize;
+  for (int i = threadIdx.x; i < imsize; i += blockDim.x) {
+    T val = x[base + i];
+    val = (val - group_mean) * var_inv;
+    if (scale) val *= scale[channel];
+    if (bias) val += bias[channel];
+    y[base + i] = val;
+  }
+}
+// CUDA forward kernel for group normalization.
+//
+// Launches GroupNormForwardGetMeanAndVar to accumulate E[x] (into Mean) and
+// E[x^2] (into a temporary), then GroupNormForward to write Y and Variance.
+// The image size is taken as x_dims[2] * x_dims[3].
+template <typename T>
+class GroupNormKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Attributes.
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const auto groups = ctx.Attr<int>("groups");
+    // Inputs (Scale and Bias may be absent) and outputs.
+    auto* x = ctx.Input<Tensor>("X");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* y = ctx.Output<Tensor>("Y");
+    auto* mean = ctx.Output<Tensor>("Mean");
+    auto* var = ctx.Output<Tensor>("Variance");
+    const auto x_dims = x->dims();
+    // Channels per group, rounded up.
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+    T* y_data = y->mutable_data<T>(ctx.GetPlace());
+    T* mean_data = mean->mutable_data<T>(ctx.GetPlace());
+    T* var_data = var->mutable_data<T>(ctx.GetPlace());
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    // Scratch buffer for E[x^2]; shaped like Variance.
+    Tensor temp_var;
+    T* temp_var_data = temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
+    // The first kernel only adds into these buffers, so clear them first.
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(dev_ctx, mean, static_cast<T>(0));
+    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
+    const T* x_data = x->data<T>();
+    const T* scale_data = scale ? scale->data<T>() : nullptr;
+    const T* bias_data = bias ? bias->data<T>() : nullptr;
+    const int imsize = x_dims[2] * x_dims[3];
+    const int block_size = std::min(512, imsize);
+    // One block per (channel-in-group, group, batch element).
+    dim3 grid(group_size, groups, x_dims[0]);
+    dim3 threads(block_size, 1, 1);
+    GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data,
+        temp_var_data);
+    GroupNormForward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        x_data, mean_data, temp_var_data, scale_data, bias_data, x_dims[0],
+        x_dims[1], imsize, groups, group_size, epsilon, y_data, var_data);
+  }
+};
+// First backward pass.
+//
+// Block indexing used here: blockIdx.x = channel inside the group,
+// blockIdx.y = group, blockIdx.z = batch element; the threads of a block
+// stride over the `imsize` elements of that channel. For its channel a block
+//   * writes dy * scale * var_inv into d_x (when d_x is non-null; the scale
+//     factor is skipped when scale is null),
+//   * atomically adds its partial sums to d_mean / d_var at index
+//     batch * groups + group, and
+//   * atomically adds to d_scale / d_bias at the channel index when those
+//     pointers are non-null.
+// d_mean, d_var, d_scale and d_bias are only added to by this kernel.
+template <typename T>
+__global__ void GroupNormBackwardGetMeanAndVar(
+    const T* x, const T* mean, const T* var, const T* scale, const T* d_y,
+    int N, int C, int imsize, int groups, int group_size, T epsilon, T* d_x,
+    T* d_mean, T* d_var, T* d_scale, T* d_bias) {
+  int gid = blockIdx.y;
+  int cid = blockIdx.x;
+  int bid = blockIdx.z;
+  int ccid = gid * group_size + cid;
+  // Blocks mapped past the last channel have nothing to do.
+  if (ccid >= C) return;
+  T x_mean = mean[bid * groups + gid];
+  T x_var = var[bid * groups + gid];
+  T var_inv = 1.0 / sqrt(x_var + epsilon);
+  // Per-thread partial sums.
+  T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0;
+  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
+    T tmp = x[(bid * C + ccid) * imsize + imid];
+    T val = (tmp - x_mean) * var_inv;
+    T dval = d_y[(bid * C + ccid) * imsize + imid];
+    // d_bias / d_scale use the raw upstream gradient, before scaling.
+    if (d_bias) d_bias_data += dval;
+    if (d_scale) d_scale_data += val * dval;
+    if (scale) dval = dval * scale[ccid];
+    d_var_data += (tmp - x_mean) * dval;
+    T d_tmp = dval * var_inv;
+    if (d_x) d_x[(bid * C + ccid) * imsize + imid] = d_tmp;
+    d_mean_data -= d_tmp;
+  }
+  // Block-level reduction through shared memory; slots 2 and 3 are only
+  // initialized and used when d_scale / d_bias are requested.
+  __shared__ T s_mem[4];
+  if (threadIdx.x == 0) {
+    s_mem[0] = s_mem[1] = 0;
+    if (d_scale) s_mem[2] = 0;
+    if (d_bias) s_mem[3] = 0;
+  }
+  __syncthreads();
+  paddle::platform::CudaAtomicAdd(&s_mem[0], d_mean_data);
+  paddle::platform::CudaAtomicAdd(&s_mem[1], d_var_data);
+  if (d_scale) paddle::platform::CudaAtomicAdd(&s_mem[2], d_scale_data);
+  if (d_bias) paddle::platform::CudaAtomicAdd(&s_mem[3], d_bias_data);
+  __syncthreads();
+  // One global atomic add per block and statistic.
+  if (threadIdx.x == 0) {
+    paddle::platform::CudaAtomicAdd(&d_mean[bid * groups + gid], s_mem[0]);
+    paddle::platform::CudaAtomicAdd(&d_var[bid * groups + gid], s_mem[1]);
+    if (d_scale) paddle::platform::CudaAtomicAdd(&d_scale[ccid], s_mem[2]);
+    if (d_bias) paddle::platform::CudaAtomicAdd(&d_bias[ccid], s_mem[3]);
+  }
+}
+// Second backward pass: adds the mean/variance gradient contribution to
+// every element of d_x, using the per-(batch, group) sums in d_mean / d_var.
+// Writes nothing when d_x is null.
+template <typename T>
+__global__ void GroupNormBackward(const T* x, const T* mean, const T* var,
+                                  const T* d_mean, const T* d_var, int N, int C,
+                                  int imsize, int groups, int group_size,
+                                  T epsilon, T* d_x) {
+  const int batch_idx = blockIdx.z;
+  const int group_idx = blockIdx.y;
+  const int channel_in_group = blockIdx.x;
+  const int channel = group_idx * group_size + channel_in_group;
+  if (channel >= C) return;
+  // Number of channels this group really has; it is smaller than group_size
+  // when fewer than group_size channels remain.
+  const int channels_in_group =
+      min(group_size, static_cast<int>(C - group_idx * group_size));
+  const int stat_idx = batch_idx * groups + group_idx;
+  const T group_mean = mean[stat_idx];
+  const T group_var = var[stat_idx];
+  T grad_mean = d_mean[stat_idx];
+  const T grad_var_inv = d_var[stat_idx];
+  T grad_var = -1.0 /
+               (2 * (group_var + epsilon) * sqrt(group_var + epsilon)) *
+               grad_var_inv;
+  grad_mean -= 2 * grad_var * group_mean;
+  // Spread both gradients evenly over the elements of the group.
+  grad_var /= channels_in_group * imsize;
+  grad_mean /= channels_in_group * imsize;
+  // The loop below only ever writes to d_x.
+  if (!d_x) return;
+  const int base = (batch_idx * C + channel) * imsize;
+  for (int i = threadIdx.x; i < imsize; i += blockDim.x) {
+    const T x_val = x[base + i];
+    d_x[base + i] += grad_mean + x_val * 2 * grad_var;
+  }
+}
+// CUDA backward kernel for group normalization.
+//
+// Launches GroupNormBackwardGetMeanAndVar (partial d_x plus the reductions
+// for d_mean / d_var / Scale@GRAD / Bias@GRAD) and then GroupNormBackward,
+// which finishes X@GRAD. Each of the three gradient outputs is optional.
+template <typename T>
+class GroupNormGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Attributes.
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const auto groups = ctx.Attr<int>("groups");
+    // Forward tensors and the incoming gradient.
+    auto* x = ctx.Input<Tensor>("X");
+    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* var = ctx.Input<Tensor>("Variance");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    // Gradient outputs; any of them may be null.
+    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    const auto& x_dims = x->dims();
+    // Channels per group, rounded up.
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+    // X@GRAD is assigned (not accumulated) by the first kernel, so it is
+    // allocated without being cleared.
+    T* d_x_data = d_x ? d_x->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    // Zeroed scratch buffers for the per-(batch, group) reductions; both are
+    // shaped like Variance.
+    Tensor temp_var;
+    T* temp_var_data = temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
+    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
+    Tensor temp_mean;
+    T* temp_mean_data = temp_mean.mutable_data<T>(var->dims(), ctx.GetPlace());
+    set_zero(dev_ctx, &temp_mean, static_cast<T>(0));
+    const T* x_data = x->data<T>();
+    const T* d_y_data = d_y->data<T>();
+    const T* mean_data = mean->data<T>();
+    const T* var_data = var->data<T>();
+    // Scale@GRAD / Bias@GRAD are accumulated atomically, so clear them first.
+    T* d_scale_data = nullptr;
+    if (d_scale) {
+      d_scale_data = d_scale->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_scale, static_cast<T>(0));
+    }
+    T* d_bias_data = nullptr;
+    if (d_bias) {
+      d_bias_data = d_bias->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_bias, static_cast<T>(0));
+    }
+    const T* scale_data = scale ? scale->data<T>() : nullptr;
+    const int imsize = x_dims[2] * x_dims[3];
+    const int block_size = std::min(512, imsize);
+    // One block per (channel-in-group, group, batch element).
+    dim3 grid(group_size, groups, x_dims[0]);
+    dim3 threads(block_size, 1, 1);
+    GroupNormBackwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        x_data, mean_data, var_data, scale_data, d_y_data, x_dims[0],
+        x_dims[1], imsize, groups, group_size, epsilon, d_x_data,
+        temp_mean_data, temp_var_data, d_scale_data, d_bias_data);
+    GroupNormBackward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        x_data, mean_data, var_data, temp_mean_data, temp_var_data, x_dims[0],
+        x_dims[1], imsize, groups, group_size, epsilon, d_x_data);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP_CUDA_KERNEL(  // CUDA forward kernels for float and double
+    group_norm,
+    ops::GroupNormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GroupNormKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(  // CUDA backward kernels for float and double
+    group_norm_grad,
+    ops::GroupNormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GroupNormGradKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/fluid/operators/group_norm_op.h
+++ b/paddle/fluid/operators/group_norm_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+// Generic (CPU) forward kernel for group normalization.
+//
+// X is walked as consecutive runs of `number * imsize` elements, one run per
+// (batch element, group), with imsize = x_dims[2] * x_dims[3]. For each run
+// the mean and the variance (E[x^2] - E[x]^2) are stored into Mean/Variance
+// and the normalized values, optionally scaled and shifted per channel, are
+// written to Y.
+template <typename DeviceContext, typename T>
+class GroupNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Attributes.
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const auto groups = ctx.Attr<int>("groups");
+    // Inputs (Scale and Bias may be absent) and outputs.
+    auto* x = ctx.Input<Tensor>("X");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* y = ctx.Output<Tensor>("Y");
+    auto* mean = ctx.Output<Tensor>("Mean");
+    auto* var = ctx.Output<Tensor>("Variance");
+    const auto x_dims = x->dims();
+    // Channels per group, rounded up.
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+    T* y_data = y->mutable_data<T>(ctx.GetPlace());
+    T* mean_data = mean->mutable_data<T>(ctx.GetPlace());
+    T* var_data = var->mutable_data<T>(ctx.GetPlace());
+    const T* x_data = x->data<T>();
+    const T* scale_data = scale ? scale->data<T>() : nullptr;
+    const T* bias_data = bias ? bias->data<T>() : nullptr;
+    const int imsize = x_dims[2] * x_dims[3];
+    // Start of the current group's run in X and Y.
+    const T* group_x = x_data;
+    T* group_y = y_data;
+    for (int bid = 0; bid < x_dims[0]; bid++) {
+      for (int gid = 0; gid < groups; gid++) {
+        // Channels this group really has; may be less than group_size.
+        const int number = std::min(
+            group_size, static_cast<int>(x_dims[1] - gid * group_size));
+        // Elements actually visited for this group (none if number <= 0).
+        const int group_elems = number > 0 ? number * imsize : 0;
+        T x_mean = 0, x_var = 0;
+        for (int i = 0; i < group_elems; i++) {
+          x_mean += group_x[i];
+          x_var += group_x[i] * group_x[i];
+        }
+        x_mean /= number * imsize;
+        x_var /= number * imsize;
+        x_var = x_var - x_mean * x_mean;
+        const T var_inv = 1.0 / sqrt(x_var + epsilon);
+        const int stat_idx = bid * groups + gid;
+        mean_data[stat_idx] = x_mean;
+        var_data[stat_idx] = x_var;
+        for (int cid = 0; cid < number; cid++) {
+          const int channel = gid * group_size + cid;
+          const T* channel_x = group_x + cid * imsize;
+          T* channel_y = group_y + cid * imsize;
+          for (int imid = 0; imid < imsize; imid++) {
+            T val = (channel_x[imid] - x_mean) * var_inv;
+            if (scale_data) val *= scale_data[channel];
+            if (bias_data) val += bias_data[channel];
+            channel_y[imid] = val;
+          }
+        }
+        group_x += group_elems;
+        group_y += group_elems;
+      }
+    }
+  }
+};
+// Generic (CPU) backward kernel for group normalization.
+//
+// Reads X, Mean, Variance, the optional Scale and Y@GRAD, and fills whichever
+// of X@GRAD, Scale@GRAD and Bias@GRAD are requested (each may be null).
+// X, Y@GRAD and X@GRAD are walked as consecutive runs of `number * imsize`
+// elements, one run per (batch element, group), with
+// imsize = x_dims[2] * x_dims[3].
+template <typename DeviceContext, typename T>
+class GroupNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* var = ctx.Input<Tensor>("Variance");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto groups = ctx.Attr<int>("groups");
+    // init output
+    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    const auto& x_dims = x->dims();
+    // Channels per group, rounded up.
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    // All requested gradients are accumulated with +=, so clear them first.
+    T* d_x_data = nullptr;
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_x, static_cast<T>(0));
+      d_x_data = d_x->data<T>();
+    }
+    auto* x_data = x->data<T>();
+    auto* y_data = d_y->data<T>();
+    auto* mean_data = mean->data<T>();
+    auto* var_data = var->data<T>();
+    T* d_scale_data = nullptr;
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_scale, static_cast<T>(0));
+      d_scale_data = d_scale->data<T>();
+    }
+    T* d_bias_data = nullptr;
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_bias, static_cast<T>(0));
+      d_bias_data = d_bias->data<T>();
+    }
+    const T* scale_data = nullptr;
+    if (scale) scale_data = scale->data<T>();
+    int imsize = x_dims[2] * x_dims[3];
+    // Start of the current group's run. The X and Y@GRAD cursors advance for
+    // every group whether or not X@GRAD is requested, so Scale@GRAD is
+    // computed from the right slice of X even when d_x is null. The X@GRAD
+    // cursor stays null (and is never offset) when d_x is null.
+    const T* group_x = x_data;
+    const T* group_d_y = y_data;
+    T* group_d_x = d_x_data;
+    for (int bid = 0; bid < x_dims[0]; bid++)
+      for (int gid = 0; gid < groups; gid++) {
+        T x_mean = mean_data[bid * groups + gid];
+        T x_var = var_data[bid * groups + gid];
+        T var_inv = 1.0 / sqrt(x_var + epsilon);
+        // Channels this group really has; may be less than group_size.
+        int number = std::min(group_size,
+                              static_cast<int>(x_dims[1] - gid * group_size));
+        // Elements actually visited for this group (none if number <= 0).
+        const int group_elems = number > 0 ? number * imsize : 0;
+        T d_var_inv = 0, d_x_mean = 0;
+        for (int cid = 0; cid < number; cid++) {
+          const int channel = gid * group_size + cid;
+          for (int imid = 0; imid < imsize; imid++) {
+            const int idx = cid * imsize + imid;
+            T val = (group_x[idx] - x_mean) * var_inv;
+            T dval = group_d_y[idx];
+            // Bias/Scale gradients use the raw upstream gradient.
+            if (d_bias_data) d_bias_data[channel] += dval;
+            if (d_scale_data) d_scale_data[channel] += val * dval;
+            if (scale_data) dval = scale_data[channel] * dval;
+            d_var_inv += (group_x[idx] - x_mean) * dval;
+            T d_tmp = dval * var_inv;
+            if (group_d_x) group_d_x[idx] += d_tmp;
+            d_x_mean -= d_tmp;
+          }
+        }
+        T d_x_var =
+            -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
+        d_x_mean -= 2 * d_x_var * x_mean;
+        // Spread both gradients evenly over the elements of the group.
+        d_x_var /= number * imsize;
+        d_x_mean /= number * imsize;
+        if (group_d_x) {
+          for (int idx = 0; idx < group_elems; idx++) {
+            group_d_x[idx] += d_x_mean;
+            group_d_x[idx] += group_x[idx] * 2 * d_x_var;
+          }
+          group_d_x += group_elems;
+        }
+        group_x += group_elems;
+        group_d_y += group_elems;
+      }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -37,6 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
@@ -86,6 +87,9 @@ bool IsCompiledWithDIST() {
 }
 PYBIND11_PLUGIN(core) {
+  // Not used, just make sure cpu_info.cc is linked.
+  paddle::platform::CpuTotalPhysicalMemory();
  paddle::memory::allocation::UseAllocatorStrategyGFlag();
  py::module m("core", "C++ core of PaddlePaddle");

--- a/python/paddle/fluid/contrib/utils/__init__.py
+++ b/python/paddle/fluid/contrib/utils/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+from . import hdfs_utils
+from .hdfs_utils import *
+__all__ = hdfs_utils.__all__
--- a/python/paddle/fluid/contrib/utils/hdfs_utils.py
+++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HDFS Utils"""
+import os
+import subprocess
+import multiprocessing
+from datetime import datetime
+import re
+import copy
+import errno
+import logging
+# Names exported by `from hdfs_utils import *`.
+# NOTE(review): multi_upload is defined in this module but is not listed
+# here -- confirm whether that omission is intentional.
+__all__ = ["HDFSClient", "multi_download"]
+# Set the root logger's record format, then create this module's own logger
+# and let it emit INFO and above.
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
+_logger = logging.getLogger("hdfs_utils")
+_logger.setLevel(logging.INFO)
+class HDFSClient(object):
+    def __init__(self, hadoop_home, configs):
+        """Build the command prefix shared by every hadoop invocation.
+        Args:
+            hadoop_home: root directory of the hadoop client; the executable
+                used is ``<hadoop_home>/bin/hadoop``.
+            configs: dict of hadoop properties; each entry is passed on the
+                command line as ``-D<key>=<value>``.
+        """
+        self.pre_commands = []
+        hadoop_bin = '%s/bin/hadoop' % hadoop_home
+        self.pre_commands.append(hadoop_bin)
+        dfs = 'fs'
+        self.pre_commands.append(dfs)
+        # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
+        for k, v in configs.items():
+            config_command = '-D%s=%s' % (k, v)
+            self.pre_commands.append(config_command)
+    def __run_hdfs_cmd(self, commands, retry_times=5):
+        """Run ``pre_commands + commands`` as a child process, retrying on failure.
+        Args:
+            commands: list of arguments appended to the shared command prefix.
+            retry_times: number of additional attempts after the first one.
+        Returns:
+            Tuple ``(return code, stdout, stderr)`` of the last attempt, or
+            ``(0, None, None)`` when no attempt was made.
+        """
+        argv = copy.deepcopy(self.pre_commands)
+        argv.extend(commands)
+        print('Running system command: {0}'.format(' '.join(argv)))
+        last_result = (0, None, None)
+        for attempt in range(retry_times + 1):
+            child = subprocess.Popen(
+                argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            std_out, std_err = child.communicate()
+            last_result = (child.returncode, std_out, std_err)
+            if not child.returncode:
+                # Zero exit status: the command worked, stop retrying.
+                break
+            _logger.warn(
+                'Times: %d, Error running command: %s. Return code: %d, Error: %s'
+                % (attempt, ' '.join(argv), child.returncode, std_err))
+        return last_result
+    def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
+        """Upload one local file into an HDFS directory.
+        Args:
+            hdfs_path: destination directory on HDFS; created when it does
+                not exist yet.
+            local_path: existing local file. Directories are not supported.
+            overwrite: what to do when the destination directory already
+                holds an entry with the same base name: delete that entry
+                first (True) or give up (False).
+            retry_times: extra attempts for the ``-put`` command.
+        Returns:
+            True when the ``-put`` command succeeds, False otherwise.
+        """
+        assert hdfs_path is not None
+        assert local_path is not None and os.path.exists(local_path)
+        if os.path.isdir(local_path):
+            _logger.warn(
+                "The Local path: {} is dir and I will support it later, return".
+                format(local_path))
+            # Report the refusal explicitly instead of returning None.
+            return False
+        base = os.path.basename(local_path)
+        if not self.is_exist(hdfs_path):
+            self.makedirs(hdfs_path)
+        else:
+            hdfs_file = os.path.join(hdfs_path, base)
+            if self.is_exist(hdfs_file):
+                if overwrite:
+                    _logger.error(
+                        "The HDFS path: {} is exist and overwrite is True, delete it".
+                        format(hdfs_file))
+                    # Remove only the conflicting entry; deleting hdfs_path
+                    # itself would wipe the whole destination directory.
+                    self.delete(hdfs_file)
+                else:
+                    _logger.error(
+                        "The HDFS path: {} is exist and overwrite is False, return".
+                        format(hdfs_file))
+                    return False
+        put_commands = ["-put", local_path, hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(put_commands,
+                                                         retry_times)
+        if returncode:
+            _logger.error("Put local path: {} to HDFS path: {} failed".format(
+                local_path, hdfs_path))
+            return False
+        else:
+            _logger.info("Put local path: {} to HDFS path: {} successfully".
+                         format(local_path, hdfs_path))
+            return True
+    def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
+        """Download one HDFS file into a local directory.
+        Args:
+            hdfs_path: file on HDFS. Directories are not supported.
+            local_path: local directory that receives the file; created
+                when missing.
+            overwrite: when a local file with the same base name already
+                exists, remove it first (True) or give up (False).
+            unzip: accepted for interface compatibility; not used.
+        Returns:
+            True when the ``-get`` command succeeds, False otherwise.
+        """
+        _logger.info('Downloading %r to %r.', hdfs_path, local_path)
+        if not self.is_exist(hdfs_path):
+            print("HDFS path: {} do not exist".format(hdfs_path))
+            return False
+        if self.is_dir(hdfs_path):
+            _logger.error(
+                "The HDFS path: {} is dir and I will support it later, return".
+                format(hdfs_path))
+            # The message announces a return; actually stop here.
+            return False
+        if os.path.exists(local_path):
+            base = os.path.basename(hdfs_path)
+            local_file = os.path.join(local_path, base)
+            if os.path.exists(local_file):
+                if overwrite:
+                    os.remove(local_file)
+                else:
+                    _logger.error(
+                        "The Local path: {} is exist and overwrite is False, return".
+                        format(local_file))
+                    return False
+        self.make_local_dirs(local_path)
+        download_commands = ["-get", hdfs_path, local_path]
+        returncode, output, errors = self.__run_hdfs_cmd(download_commands)
+        if returncode:
+            _logger.error("Get local path: {} from HDFS path: {} failed".format(
+                local_path, hdfs_path))
+            return False
+        else:
+            # Only claim completion once the command has really succeeded.
+            _logger.info('Download of %s to %r complete.', hdfs_path,
+                         local_path)
+            _logger.info("Get local path: {} from HDFS path: {} successfully".
+                         format(local_path, hdfs_path))
+            return True
+    def is_exist(self, hdfs_path=None):
+        """Report whether a path exists on HDFS.
+        Runs ``-test -e`` on the path with one retry.
+        Args:
+            hdfs_path: path on HDFS to check.
+        Returns:
+            True when the command exits with status 0, False otherwise.
+        """
+        status, _, _ = self.__run_hdfs_cmd(
+            ['-test', '-e', hdfs_path], retry_times=1)
+        if not status:
+            _logger.info("HDFS is_exist HDFS path: {} successfully".format(
+                hdfs_path))
+            return True
+        _logger.error("HDFS is_exist HDFS path: {} failed".format(hdfs_path))
+        return False
+    def is_dir(self, hdfs_path=None):
+        """Report whether a path exists on HDFS and is a directory.
+        The existence check runs first; ``-test -d`` is only issued (with
+        one retry) for a path that exists.
+        Args:
+            hdfs_path: path on HDFS to check.
+        Returns:
+            True when both checks succeed, False otherwise.
+        """
+        if self.is_exist(hdfs_path):
+            status, _, _ = self.__run_hdfs_cmd(
+                ['-test', '-d', hdfs_path], retry_times=1)
+            if not status:
+                _logger.info("HDFS path: {} successfully is a directory".
+                             format(hdfs_path))
+                return True
+            _logger.error("HDFS path: {} failed is not a directory".format(
+                hdfs_path))
+        return False
+    def delete(self, hdfs_path):
+        """Remove a file or a directory from HDFS.
+        A directory is removed with ``-rmr``, anything else with ``-rm``;
+        the command is run once, without retries.
+        Args:
+            hdfs_path: path on HDFS to remove.
+        Returns:
+            True when the path did not exist or the removal succeeded,
+            False when the removal command failed.
+        """
+        _logger.info('Deleting %r.', hdfs_path)
+        if not self.is_exist(hdfs_path):
+            _logger.warn("HDFS path: {} do not exist".format(hdfs_path))
+            return True
+        remove_flag = '-rmr' if self.is_dir(hdfs_path) else '-rm'
+        status, _, _ = self.__run_hdfs_cmd(
+            [remove_flag, hdfs_path], retry_times=0)
+        if not status:
+            _logger.info("HDFS path: {} delete files successfully".format(
+                hdfs_path))
+            return True
+        _logger.error("HDFS path: {} delete files failure".format(hdfs_path))
+        return False
+    def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
+        """Move a file or folder on HDFS with ``-mv``.
+        A missing source, or an existing destination while ``overwrite`` is
+        False, is only logged; the move is attempted regardless (one retry).
+        Args:
+            hdfs_src_path: source path on HDFS.
+            hdfs_dst_path: destination path on HDFS.
+            overwrite: only silences the "destination exists" log message.
+        Returns:
+            True when the ``-mv`` command succeeds, False otherwise.
+        """
+        assert hdfs_src_path is not None
+        assert hdfs_dst_path is not None
+        source_found = self.is_exist(hdfs_src_path)
+        if not source_found:
+            _logger.info("HDFS path do not exist: {}".format(hdfs_src_path))
+        destination_found = self.is_exist(hdfs_dst_path)
+        if destination_found and not overwrite:
+            _logger.error("HDFS path is exist: {} and overwrite=False".format(
+                hdfs_dst_path))
+        status, _, _ = self.__run_hdfs_cmd(
+            ['-mv', hdfs_src_path, hdfs_dst_path], retry_times=1)
+        if not status:
+            _logger.info("HDFS rename path: {} to {} successfully".format(
+                hdfs_src_path, hdfs_dst_path))
+            return True
+        _logger.error("HDFS rename path: {} to {} failed".format(
+            hdfs_src_path, hdfs_dst_path))
+        return False
+    @staticmethod
+    def make_local_dirs(local_path):
+        """Create a local directory tree, tolerating one that already exists.
+        Args:
+            local_path: directory to create, including missing parents.
+        Raises:
+            OSError: for any failure other than "already exists".
+        """
+        try:
+            os.makedirs(local_path)
+        except OSError as err:
+            if err.errno == errno.EEXIST:
+                return
+            raise
+    def makedirs(self, hdfs_path):
+        """Create a directory on HDFS with ``-mkdir`` (one retry).
+        Args:
+            hdfs_path: remote directory to create.
+        Returns:
+            None when the path already exists, True when the command
+            succeeds, False when it fails.
+        """
+        _logger.info('Creating directories to %r.', hdfs_path)
+        assert hdfs_path is not None
+        if self.is_exist(hdfs_path):
+            return
+        mkdirs_commands = ['-mkdir', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            mkdirs_commands, retry_times=1)
+        if returncode:
+            _logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
+            return False
+        else:
+            # Success is informational; it must not show up as an error.
+            _logger.info("HDFS mkdir path: {} successfully".format(hdfs_path))
+            return True
+    def ls(self, hdfs_path):
+        """List the entries directly under an HDFS path (``-ls``, one retry).
+        Only output lines that split into exactly eight whitespace-separated
+        fields are used; the eighth field is taken as the entry path.
+        Args:
+            hdfs_path: path on HDFS to list.
+        Returns:
+            List of entry paths; empty when the path does not exist or the
+            command fails.
+        """
+        assert hdfs_path is not None
+        if not self.is_exist(hdfs_path):
+            return []
+        ls_commands = ['-ls', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            ls_commands, retry_times=1)
+        if returncode:
+            _logger.error("HDFS list path: {} failed".format(hdfs_path))
+            return []
+        _logger.info("HDFS list path: {} successfully".format(hdfs_path))
+        if not isinstance(output, str):
+            # Pipes hand back bytes on Python 3; splitting them with a str
+            # separator raises TypeError.
+            output = output.decode('utf-8')
+        ret_lines = []
+        # Raw string: '\s' in a plain literal is an invalid escape sequence.
+        regex = re.compile(r'\s+')
+        for line in output.strip().split("\n"):
+            re_line = regex.split(line)
+            if len(re_line) == 8:
+                ret_lines.append(re_line[7])
+        return ret_lines
+    def lsr(self, hdfs_path, only_file=True, sort=True):
+        """List everything under an HDFS path recursively (``-lsr``, one retry).
+        Only output lines that split into exactly eight whitespace-separated
+        fields are used: field 8 is the path, fields 6 and 7 its date and time.
+        Args:
+            hdfs_path: path on HDFS to list.
+            only_file: skip entries whose first field starts with ``d``.
+            sort: order the result by the listed timestamp, oldest first.
+        Returns:
+            List of paths; empty when the path does not exist or the
+            command fails.
+        """
+        def modified_time(entry):
+            # entry is (path, "<date> <time>") as printed by the listing.
+            return datetime.strptime(entry[1], '%Y-%m-%d %H:%M')
+        assert hdfs_path is not None
+        if not self.is_exist(hdfs_path):
+            return []
+        ls_commands = ['-lsr', hdfs_path]
+        returncode, output, errors = self.__run_hdfs_cmd(
+            ls_commands, retry_times=1)
+        if returncode:
+            _logger.error("HDFS list all files: {} failed".format(hdfs_path))
+            return []
+        _logger.info("HDFS list all files: {} successfully".format(hdfs_path))
+        if not isinstance(output, str):
+            # Pipes hand back bytes on Python 3; splitting them with a str
+            # separator raises TypeError.
+            output = output.decode('utf-8')
+        lines = []
+        # Raw string: '\s' in a plain literal is an invalid escape sequence.
+        regex = re.compile(r'\s+')
+        for line in output.strip().split("\n"):
+            re_line = regex.split(line)
+            if len(re_line) != 8:
+                continue
+            if only_file and re_line[0].startswith("d"):
+                continue
+            lines.append((re_line[7], re_line[5] + " " + re_line[6]))
+        if sort:
+            # Sort in place. The previous sorted(..., cmp=...) call threw its
+            # result away and does not exist on Python 3.
+            lines.sort(key=modified_time)
+        return [ret[0] for ret in lines]
+def multi_upload(client,
+                 hdfs_path,
+                 local_path,
+                 multi_processes=5,
+                 overwrite=False):
+    """
+    Upload every file under a local directory to HDFS using several processes.
+    The directory layout below local_path is reproduced below hdfs_path.
+    :param client: instance of HDFSClient
+    :param hdfs_path: destination root on hdfs
+    :param local_path: local directory that is walked recursively
+    :param multi_processes: number of upload processes started, default=5
+    :param overwrite: passed through to HDFSClient.upload
+    :return: None
+    """
+    def _upload_batch(batch):
+        # Worker body: push each file into the matching remote sub-directory.
+        for local_file in batch:
+            sub_dir = os.path.relpath(os.path.dirname(local_file), local_path)
+            remote_dir = os.path.join(hdfs_path, sub_dir)
+            client.upload(remote_dir, local_file, overwrite, retry_times=5)
+    def _collect_files(root):
+        # Every regular file below root; nothing when root is not a directory.
+        if not os.path.isdir(root):
+            return []
+        return [
+            os.path.join(parent, file_name)
+            for parent, _, file_names in os.walk(root)
+            for file_name in file_names
+        ]
+    assert isinstance(client, HDFSClient)
+    all_files = _collect_files(local_path)
+    if not all_files:
+        _logger.info("there are nothing need to upload, exit")
+        return
+    _logger.info("Start {} multi process to upload datas".format(
+        multi_processes))
+    workers = []
+    for index in range(multi_processes):
+        # Worker `index` takes every multi_processes-th file.
+        worker = multiprocessing.Process(
+            target=_upload_batch, args=(all_files[index::multi_processes], ))
+        workers.append(worker)
+        worker.start()
+    # Wait for all workers before reporting completion.
+    for worker in workers:
+        worker.join()
+    _logger.info("Finish {} multi process to upload datas".format(
+        multi_processes))
+def multi_download(client,
+                   hdfs_path,
+                   local_path,
+                   trainer_id,
+                   trainers,
+                   multi_processes=5):
+    """
+    Download this trainer's share of the files under hdfs_path using
+    several processes.
+    The recursive listing is split round-robin over the trainers, and this
+    trainer's share is split round-robin again over the worker processes.
+    :param client: instance of HDFSClient
+    :param hdfs_path: path on hdfs
+    :param local_path: path on local
+    :param trainer_id: current trainer id
+    :param trainers: all trainers number
+    :param multi_processes: the download data process at the same time, default=5
+    :return: list of local paths of the files assigned to this trainer
+    """
+    def _local_dir_for(remote_file):
+        # Mirror the remote sub-directory below local_path.
+        sub_dir = os.path.relpath(os.path.dirname(remote_file), hdfs_path)
+        return sub_dir, os.path.join(local_path, sub_dir)
+    def _download_batch(batch):
+        for remote_file in batch:
+            client.download(remote_file, _local_dir_for(remote_file)[1])
+    assert isinstance(client, HDFSClient)
+    client.make_local_dirs(local_path)
+    _logger.info("Make local dir {} successfully".format(local_path))
+    all_need_download = client.lsr(hdfs_path, sort=True)
+    need_download = all_need_download[trainer_id::trainers]
+    _logger.info("Get {} files From all {} files need to be download from {}".
+                 format(len(need_download), len(all_need_download), hdfs_path))
+    _logger.info("Start {} multi process to download datas".format(
+        multi_processes))
+    workers = []
+    for index in range(multi_processes):
+        worker = multiprocessing.Process(
+            target=_download_batch,
+            args=(need_download[index::multi_processes], ))
+        workers.append(worker)
+        worker.start()
+    # Wait for all workers before building the result.
+    for worker in workers:
+        worker.join()
+    _logger.info("Finish {} multi process to download datas".format(
+        multi_processes))
+    return [
+        os.path.join(local_path,
+                     _local_dir_for(remote_file)[0],
+                     os.path.basename(remote_file))
+        for remote_file in need_download
+    ]
+# Usage example, executed only when the module is run as a script. The hadoop
+# home, cluster address, credentials and paths below are placeholders.
+if __name__ == "__main__":
+    hadoop_home = "/home/client/hadoop-client/hadoop/"
+    # Each entry becomes a -D<key>=<value> option of the hadoop command line.
+    configs = {
+        "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+        "hadoop.job.ugi": "hello,hello123"
+    }
+    client = HDFSClient(hadoop_home, configs)
+    # Plain and recursive listings; the results are not used further.
+    client.ls("/user/com/train-25")
+    files = client.lsr("/user/com/train-25/models")
+    # Download the share of trainer 1 out of 5 trainers with 5 processes.
+    downloads = multi_download(
+        client,
+        "/user/com/train-25/model",
+        "/home/xx/data1",
+        1,
+        5,
+        multi_processes=5)
+    # Upload the local directory back to the same remote location.
+    multi_upload(client, "/user/com/train-25/model", "/home/xx/data1")
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -85,6 +85,7 @@ __all__ = [
    'row_conv',
    'multiplex',
    'layer_norm',
+    'group_norm',
    'softmax_with_cross_entropy',
    'smooth_l1',
    'one_hot',
@@ -2547,6 +2548,84 @@ def layer_norm(input,
    return helper.append_activation(layer_norm_out)
+@templatedoc()
+def group_norm(input,
+               groups,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               act=None,
+               data_layout='NCHW',
+               name=None):
+    """
+    **Group Normalization Layer**
+    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
+    Args:
+        input(Variable): The input tensor variable.
+        groups(int): The number of groups the channels are divided into.
+        epsilon(float): The small value added to the variance to prevent
+            division by zero.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            scale :math:`g`. The scale parameter is only created when this
+            argument is truthy, in which case its default initializer is the
+            constant 1.0. With False or the default None no scale is added to
+            the output units. Default: None.
+        bias_attr(ParamAttr|None): The parameter attribute for the learnable
+            bias :math:`b`. The bias parameter is only created when this
+            argument is truthy. With False or the default None no bias is
+            added to the output units. Default: None.
+        act(str): Activation to be applied to the output of group normalization.
+        data_layout(string|NCHW): Only NCHW is supported; any other value
+            raises ValueError.
+        name (str): The name of this layer. It is optional.
+    Returns:
+        Variable: A tensor variable which is the result after applying group normalization on the input.
+    Examples:
+        >>> data = fluid.layers.data(name='data', shape=[8, 32, 32],
+        >>>                          dtype='float32')
+        >>> x = fluid.layers.group_norm(input=data, groups=4)
+    """
+    # locals() forwards every argument of this function to the helper, so no
+    # other local variable may be defined before this line.
+    helper = LayerHelper('group_norm', **locals())
+    dtype = helper.input_dtype()
+    # create input and parameters
+    inputs = {'X': input}
+    input_shape = input.shape
+    if data_layout != 'NCHW':
+        raise ValueError("unsupported data layout:" + data_layout)
+    # One scale / bias value per channel (dimension 1 of the NCHW input).
+    param_shape = [input_shape[1]]
+    # NOTE(review): with the default param_attr=None / bias_attr=None these
+    # checks are falsy, so neither Scale nor Bias is fed to the op -- confirm
+    # this is the intended default.
+    if param_attr:
+        scale = helper.create_parameter(
+            attr=helper.param_attr,
+            shape=param_shape,
+            dtype=dtype,
+            default_initializer=Constant(1.0))
+        inputs['Scale'] = scale
+    if bias_attr:
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        inputs['Bias'] = bias
+    # create output
+    # Mean and Variance are op outputs that do not take part in back-propagation.
+    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    group_norm_out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="group_norm",
+        inputs=inputs,
+        outputs={
+            "Y": group_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon,
+               "groups": groups})
+    return helper.append_activation(group_norm_out)
 def conv2d_transpose(input,
                     num_filters,
                     output_size=None,

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -23,11 +23,11 @@ if(NOT WITH_DISTRIBUTE)
    LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)
-if(WITH_GPU)
+if (NOT ${WITH_GPU})
-    if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
-    endif()
+elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-endif(WITH_GPU)
+    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+endif()
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
@@ -81,10 +81,12 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
+list(REMOVE_ITEM TEST_OPS test_interpolate_op)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
+py_test_modules(test_interpolate_op MODULES test_interpolate_op SERIAL)
 if(WITH_DISTRIBUTE)
    py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)

--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -381,7 +381,7 @@ class OpTest(unittest.TestCase):
            outs.sort(key=len)
            checker(outs)
-    def __assert_is_close(self, numeric_grads, analytic_grads, names,
+    def _assert_is_close(self, numeric_grads, analytic_grads, names,
                         max_relative_error, msg_prefix):
        for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
@@ -451,7 +451,7 @@ class OpTest(unittest.TestCase):
        analytic_grads = self._get_gradient(inputs_to_check, place,
                                            output_names, no_grad_set)
-        self.__assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
+        self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
                              max_relative_error,
                              "Gradient Check On %s" % str(place))

--- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import numpy as np
+from operator import mul
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test import OpTest
+from testsuite import create_op
+def group_norm_naive(x, scale, bias, epsilon, groups):
+    """NumPy reference implementation of group normalization.
+    Args:
+        x: 4-D array; its four dimensions are unpacked as (N, C, H, W).
+        scale: per-channel scale, reshaped to broadcast over H and W.
+        bias: per-channel bias, reshaped to broadcast over H and W.
+        epsilon: value added to the variance before the square root.
+        groups: number of groups each sample is split into.
+    Returns:
+        Tuple (output with the shape of x, mean of shape (N, groups),
+        variance of shape (N, groups)).
+    """
+    batch, channels, height, width = x.shape
+    # One row per (sample, group) so statistics reduce along axis 1.
+    grouped = x.reshape((batch * groups, -1))
+    group_mean = np.mean(grouped, axis=1, keepdims=True)
+    group_var = np.var(grouped, axis=1, keepdims=True)
+    normalized = (grouped - group_mean) / np.sqrt(group_var + epsilon)
+    normalized = normalized.reshape((batch, channels, height, width))
+    channel_scale = scale.reshape((-1, 1, 1))
+    channel_bias = bias.reshape((-1, 1, 1))
+    output = normalized * channel_scale + channel_bias
+    return (output, group_mean.reshape((batch, groups)),
+            group_var.reshape((batch, groups)))
+class TestGroupNormOp(OpTest):
+    # Base test case for the group_norm operator. Subclasses adjust the
+    # configuration by overriding init_test_case().
+    def setUp(self):
+        # Default configuration: float32 input of shape (2, 4, 3, 3), 2 groups.
+        self.op_type = "group_norm"
+        self.data_format = "NCHW"
+        self.dtype = np.float32
+        self.shape = (2, 4, 3, 3)
+        self.attrs = {'epsilon': 1e-5, 'groups': 2}
+        self.compare_between_place = False
+        # Hook for subclasses; must run before the random inputs are drawn
+        # because it may change self.shape and self.attrs.
+        self.init_test_case()
+        input = np.random.random(self.shape).astype(self.dtype)
+        scale = np.random.random([self.shape[1]]).astype(self.dtype)
+        bias = np.random.random([self.shape[1]]).astype(self.dtype)
+        # Expected outputs come from the NumPy reference implementation.
+        output, mean, var = group_norm_naive(
+            input, scale, bias, self.attrs['epsilon'], self.attrs['groups'])
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(input),
+            'Scale': OpTest.np_dtype_to_fluid_dtype(scale),
+            'Bias': OpTest.np_dtype_to_fluid_dtype(bias)
+        }
+        self.outputs = {'Y': output, 'Mean': mean, 'Variance': var}
+    def test_check_output(self):
+        # Check the forward outputs on CPU and, when available, on CUDA.
+        atol = 1e-4
+        place = core.CPUPlace()
+        self.check_output_with_place(place, atol=atol)
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=atol)
+    def do_compare_between_place(self):
+        # Compare the gradients computed on CPU with those computed on CUDA
+        # device 0; does nothing in a build without CUDA.
+        if not core.is_compiled_with_cuda(): return
+        place = core.CPUPlace()
+        place2 = core.CUDAPlace(0)
+        self.scope = core.Scope()
+        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
+        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
+        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
+        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
+                            op_attrs)
+        inputs_to_check = set(['X', 'Scale', 'Bias'])
+        output_names = 'Y'
+        cpu_grads = self._get_gradient(inputs_to_check, place, output_names,
+                                       None)
+        gpu_grads = self._get_gradient(inputs_to_check, place2, output_names,
+                                       None)
+        # The two gradient sets must agree within a relative error of 0.005.
+        self._assert_is_close(cpu_grads, gpu_grads, inputs_to_check, 0.005,
+                              "Gradient Check On %s" % str(place))
+    def test_check_grad(self):
+        # Cases that set compare_between_place use the CPU-vs-CUDA comparison
+        # above in place of check_grad_with_place.
+        if self.compare_between_place:
+            self.do_compare_between_place()
+            return
+        place = core.CPUPlace()
+        self.check_grad_with_place(
+            place, set(['X', 'Scale', 'Bias']), 'Y', max_relative_error=0.01)
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['X', 'Scale', 'Bias']),
+                'Y',
+                max_relative_error=0.01)
+    def init_test_case(self):
+        # Default configuration: nothing to override.
+        pass
+class TestGroupNormOp1(TestGroupNormOp):
+    # Default shape with all 4 channels normalized as a single group.
+    def init_test_case(self):
+        self.attrs['groups'] = 1
+class TestGroupNormOp2(TestGroupNormOp):
+    # Default shape with one group per channel (4 groups for 4 channels).
+    def init_test_case(self):
+        self.attrs['groups'] = 4
+class TestGroupNormOpBigEps1(TestGroupNormOp):
+    # Single group combined with a large epsilon of 0.5.
+    def init_test_case(self):
+        self.attrs['groups'] = 1
+        self.attrs['epsilon'] = 0.5
+class TestGroupNormOpBigEps2(TestGroupNormOp):
+    # One group per channel combined with a large epsilon of 0.5.
+    def init_test_case(self):
+        self.attrs['groups'] = 4
+        self.attrs['epsilon'] = 0.5
+class TestGroupNormOpBigEps3(TestGroupNormOp):
+    # Default group count (2) combined with a large epsilon of 0.5.
+    def init_test_case(self):
+        self.attrs['epsilon'] = 0.5
+class TestGroupNormOpLargeData(TestGroupNormOp):
+    # Larger input (2, 32, 64, 64) with 8 groups. Setting
+    # compare_between_place makes test_check_grad compare CPU and CUDA
+    # gradients instead of calling check_grad_with_place.
+    def init_test_case(self):
+        self.shape = (2, 32, 64, 64)
+        self.attrs['groups'] = 8
+        self.compare_between_place = True
+if __name__ == '__main__':
+    unittest.main()
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -36,17 +36,21 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
    tar xzf protobuf-cpp-3.1.0.tar.gz && \
    cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
-RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
+RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \
    go get github.com/Masterminds/glide && \
    rm -rf /root/requirements.txt
 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
 RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
    cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz

--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -9,12 +9,12 @@ set -ex
 # remove others to expedite build and reduce docker image size. The original
 # manylinux docker image project builds many python versions.
 # NOTE We added back 3.5.1, since auditwheel requires python 3.3+
-CPYTHON_VERSIONS="2.7.11 3.5.1"
+CPYTHON_VERSIONS="3.7.0 3.6.0 3.5.1 2.7.11"
 # openssl version to build, with expected sha256 hash of .tar.gz
 # archive
-OPENSSL_ROOT=openssl-1.0.2l
+OPENSSL_ROOT=openssl-1.1.0i
-OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c
+OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
 EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
 DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
 PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
@@ -25,7 +25,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
 # Dependencies for compiling Python that we want to remove from
 # the final image after compiling Python
-PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel"
+PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"
 # Libraries that are allowed as part of the manylinux1 profile
 MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel"
@@ -61,7 +61,7 @@ yum -y install bzip2 make git patch unzip bison yasm diffutils \
 wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \
 cd cmake-3.5.2 && ./bootstrap && \
-make -j4 && make install && cd .. && rm cmake-3.5.2.tar.gz
+make -j8 && make install && cd .. && rm cmake-3.5.2.tar.gz
 # Install newest autoconf
@@ -77,11 +77,13 @@ mkdir -p /opt/python
 build_cpythons $CPYTHON_VERSIONS
 PY35_BIN=/opt/python/cp35-cp35m/bin
+PY36_BIN=/opt/python/cp36-cp36m/bin
+PY37_BIN=/opt/python/cp37-cp37m/bin
 # NOTE Since our custom manylinux image builds pythons with shared
 # libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running
 # python.
 ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}"
-LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib"
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib"
 # Our openssl doesn't know how to find the system CA trust store
 #   (https://github.com/pypa/manylinux/issues/53)
@@ -119,9 +121,8 @@ ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel
 # final image
 yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
    avahi freetype bitstream-vera-fonts \
-    ${PYTHON_COMPILE_DEPS}  > /dev/null 2>&1
+    ${PYTHON_COMPILE_DEPS}  > /dev/null 2>&1 || true
+# NOTE keep the install as its own command: the script runs under `set -e`,
+# and chaining it as `install && clean || true` would also swallow a failed
+# install. Only the yum cache cleanup is best-effort.
 yum -y install ${MANYLINUX1_DEPS}
-yum -y clean all > /dev/null 2>&1
+yum -y clean all > /dev/null 2>&1 || true
 yum list installed
 # we don't need libpython*.a, and they're many megabytes
 find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f

--- a/tools/manylinux1/build_scripts/build_utils.sh
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@@ -52,9 +52,17 @@ function do_cpython_build {
    # NOTE --enable-shared for generating libpython shared library needed for
    # linking of some of the nupic.core test executables.
+    if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then
+        # NOTE python 3.7 should be installed via make altinstall rather than
+        # make install, and we should specify the location of ssl
+        CFLAGS="-Wformat" ./configure --prefix=${prefix} --with-openssl=/usr/local/ssl --enable-shared $unicode_flags > /dev/null
+        make -j8 > /dev/null
+        make altinstall > /dev/null
+    else
        CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
-    make -j2 > /dev/null
+        make -j8 > /dev/null
        make install > /dev/null
+    fi
    popd
    echo "ZZZ looking for libpython"
    find / -name 'libpython*.so*'
@@ -64,6 +72,9 @@ function do_cpython_build {
    if [ -e ${prefix}/bin/python3 ]; then
        ln -s python3 ${prefix}/bin/python
    fi
+    if [ -e ${prefix}/bin/python3.7 ]; then
+        ln -s python3.7 ${prefix}/bin/python
+    fi
    # NOTE Make libpython shared library visible to python calls below
    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel