test=develop

bbc818a5 · sneaxiy · 203027ca · 57c90e95 · bbc818a5 · bbc818a5
26 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -174,6 +174,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -41,6 +41,7 @@ pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
 if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base)
+    pass_library(depthwise_conv_mkldnn_pass base)
    pass_library(conv_bias_mkldnn_fuse_pass inference)
    pass_library(conv_relu_mkldnn_fuse_pass inference)
    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
@@ -59,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 if (WITH_MKLDNN)
+    cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase {
  virtual ~ConvReLUFusePass() {}

 protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir

--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"

 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
@@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetInput("X", inputs);
  }
  op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }

 // a->OP0->b

--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Declares a local pointer named `id` bound to the subgraph node that was
+// matched for the pattern node whose name is the literal text `id`.
+// Expects a map-like variable called `subgraph` to be in scope at the
+// expansion site. Enforces that the pattern node is present in `subgraph` and
+// that the mapped node pointer is non-null.
+// The expansion consists of several statements and a declaration, so it must
+// only be used where a plain statement sequence is valid (not as the unbraced
+// body of an if/for).
+#define GET_NODE(id, pattern)                               \
+  PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
+                 "pattern has no Node called %s", #id);     \
+  auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
+  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+
+// Retypes every `depthwise_conv2d` op that has `use_mkldnn == true` to
+// `conv2d`, reports the number of rewritten ops through AddStatis, and hands
+// the (mutated in place) graph back to the caller.
+std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  ir::Graph* const raw_graph = graph.get();
+  PADDLE_ENFORCE(raw_graph);
+  FusePassBase::Init("depthwise_conv_mkldnn_pass", raw_graph);
+
+  // Single-node pattern: a depthwise convolution flagged for MKL-DNN.
+  GraphPatternDetector detector;
+  auto* pattern = detector.mutable_pattern();
+  pattern->NewNode("depthwise_conv")
+      ->assert_is_op("depthwise_conv2d")
+      ->assert_op_attr("use_mkldnn", true);
+
+  int rewritten_ops = 0;
+  // Invoked once per match; GET_NODE relies on the parameter being named
+  // `subgraph` and introduces the local `depthwise_conv`.
+  auto retype_to_conv2d = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                              Graph* g) {
+    VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
+    GET_NODE(depthwise_conv, (*pattern));
+    depthwise_conv->Op()->SetType("conv2d");
+    ++rewritten_ops;
+  };
+  detector(raw_graph, retype_to_conv2d);
+
+  AddStatis(rewritten_ops);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+// Registers the pass under the name "depthwise_conv_mkldnn_pass"; the tester
+// and the analyzer pass list refer to it by this name.
+REGISTER_PASS(depthwise_conv_mkldnn_pass,
+              paddle::framework::ir::DepthwiseConvMKLDNNPass);
--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Graph pass derived from FusePassBase. Only the declaration lives here; the
+// behaviour of ApplyImpl is defined in the accompanying .cc file.
+class DepthwiseConvMKLDNNPass : public FusePassBase {
+ public:
+  virtual ~DepthwiseConvMKLDNNPass() {}
+
+ protected:
+  // Takes ownership of `graph` and returns the graph to continue with.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Appends an op of the given `type` to block 0 of `prog`.
+// `inputs` must hold at least three names, bound in order to the "Input",
+// "Filter" and "Bias" slots; `outputs` is bound to "Out". The `use_mkldnn`
+// flag and `name` are stored as attributes.
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
+  auto* desc = prog->MutableBlock(0)->AppendOp();
+  desc->SetType(type);
+  desc->SetAttr("use_mkldnn", use_mkldnn);
+  desc->SetAttr("name", name);
+
+  static const char* const kInputSlots[] = {"Input", "Filter", "Bias"};
+  for (size_t slot = 0; slot < 3; ++slot) {
+    desc->SetInput(kInputSlots[slot], {inputs[slot]});
+  }
+  desc->SetOutput("Out", outputs);
+}
+
+// Builds a linear four-op program:
+//   (a, weights,  bias )->depthwise conv, MKL-DNN    ->b
+//   (b, weights2, bias2)->depthwise conv, no MKL-DNN ->c
+//   (c, weights3, bias3)->conv, MKL-DNN              ->d
+//   (d, weights4, bias4)->conv, no MKL-DNN           ->e
+// All variables are SELECTED_ROWS; weights and biases are persistable.
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+
+  // Creates one variable in block 0, optionally marking it persistable.
+  auto declare_var = [&prog](const std::string& var_name, bool persistable) {
+    auto* var = prog.MutableBlock(0)->Var(var_name);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (persistable) {
+      var->SetPersistable(true);
+    }
+  };
+
+  const std::vector<std::string> activations = {"a", "b", "c", "d", "e"};
+  const std::vector<std::string> parameters = {
+      "weights",  "bias",  "weights2", "bias2",
+      "weights3", "bias3", "weights4", "bias4"};
+  for (const auto& var_name : activations) {
+    declare_var(var_name, false);
+  }
+  for (const auto& var_name : parameters) {
+    declare_var(var_name, true);
+  }
+
+  // depthwise conv with MKL-DNN
+  SetOp(&prog, "depthwise_conv2d", "conv1", {"a", "weights", "bias"}, {"b"},
+        true);
+  // depthwise conv without MKL-DNN
+  SetOp(&prog, "depthwise_conv2d", "conv2", {"b", "weights2", "bias2"}, {"c"},
+        false);
+  // conv with MKL-DNN
+  SetOp(&prog, "conv2d", "conv3", {"c", "weights3", "bias3"}, {"d"}, true);
+  // conv without MKL-DNN
+  SetOp(&prog, "conv2d", "conv4", {"d", "weights4", "bias4"}, {"e"}, false);
+
+  return prog;
+}
+
+// Applies depthwise_conv_mkldnn_pass to the program from BuildProgramDesc and
+// checks the per-kind op counts: the MKL-DNN depthwise conv must have moved
+// to the MKL-DNN conv2d bucket while the other two buckets stay unchanged.
+TEST(DepthwiseConvMKLDNNPass, basic) {
+  struct counters {
+    int mkldnn_depthwise_conv_nodes;
+    int other_depthwise_conv_nodes;
+    int mkldnn_conv_nodes;
+    int other_conv_nodes;
+  };
+
+  auto prog = BuildProgramDesc();
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
+
+  // The input program contains exactly one op of each kind.
+  const counters before{1, 1, 1, 1};
+
+  graph = pass->Apply(std::move(graph));
+
+  // Tally the convolution ops that remain after the pass.
+  counters after{0, 0, 0, 0};
+  for (auto* node : graph->Nodes()) {
+    if (!node->IsOp()) continue;
+
+    auto* desc = node->Op();
+    const bool is_conv = desc->Type() == "conv2d";
+    const bool is_depthwise = !is_conv && desc->Type() == "depthwise_conv2d";
+    if (!is_conv && !is_depthwise) continue;
+
+    const bool on_mkldnn = boost::get<bool>(desc->GetAttr("use_mkldnn"));
+    int& tally =
+        is_conv ? (on_mkldnn ? after.mkldnn_conv_nodes : after.other_conv_nodes)
+                : (on_mkldnn ? after.mkldnn_depthwise_conv_nodes
+                             : after.other_depthwise_conv_nodes);
+    ++tally;
+  }
+
+  EXPECT_EQ(after.other_depthwise_conv_nodes,
+            before.other_depthwise_conv_nodes);
+  EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
+  EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
+            before.mkldnn_depthwise_conv_nodes - 1);
+  EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+// References the pass that the test looks up by name in PassRegistry.
+USE_PASS(depthwise_conv_mkldnn_pass);
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"

 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
@@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
    op->SetInput("X", inputs);
  }
  op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }

 // a->OP0->b

--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -23,8 +23,62 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+namespace {
+
+// Verifies that operator roles appear in a legal order in `program`.
+//
+// Every op of every block is visited in order. Ops without the role attribute
+// are skipped (older programs may not carry it). The set of roles seen so far
+// is shared across all blocks. The enforced rules are:
+//   * a forward op must not appear once a backward op has been seen;
+//   * a backward (or backward|loss) op must not appear once an optimize op
+//     has been seen;
+//   * a forward|loss op must not appear once a backward|loss op or an
+//     optimize op has been seen;
+//   * an optimize (or optimize|lr_sched) op requires an earlier backward op.
+// A violated rule raises through PADDLE_ENFORCE; an unrecognized role value
+// terminates through LOG(FATAL).
+void CheckProgram(const ProgramDesc &program) {
+  // Integer role values, usable as case labels.
+  constexpr int kForwardRole = static_cast<int>(OpRole::kForward);
+  constexpr int kBackwardRole = static_cast<int>(OpRole::kBackward);
+  constexpr int kOptimizeRole = static_cast<int>(OpRole::kOptimize);
+  constexpr int kLossRole = static_cast<int>(OpRole::kLoss);
+  constexpr int kLRSchedRole = static_cast<int>(OpRole::kLRSched);
+  constexpr int kDistRole = static_cast<int>(OpRole::kDist);
+  constexpr int kRPCRole = static_cast<int>(OpRole::kRPC);
+  constexpr int kNotSpecifiedRole = static_cast<int>(OpRole::kNotSpecified);
+
+  std::map<int, bool> seen;
+  auto not_seen = [&seen](int role) { return seen.find(role) == seen.end(); };
+
+  for (size_t i = 0; i < program.Size(); ++i) {
+    for (OpDesc *op : program.Block(i).AllOps()) {
+      // For backward compatibility, some program doesn't have role added.
+      if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
+      int role_id = boost::get<int>(
+          op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+      // Record the role first: the checks below only look at *other* roles.
+      seen[role_id] = true;
+      switch (role_id) {
+        case kForwardRole:
+          PADDLE_ENFORCE(
+              not_seen(kBackwardRole),
+              "Cannot add forward operator after backward operator.");
+          break;
+        case kBackwardRole:
+        case kBackwardRole | kLossRole:
+          PADDLE_ENFORCE(
+              not_seen(kOptimizeRole),
+              "Cannot add backward operator after optimize operator.");
+          break;
+        case kForwardRole | kLossRole:
+          PADDLE_ENFORCE(not_seen(kBackwardRole | kLossRole),
+                         "Cannot add forward|loss operator after "
+                         "backward|loss operator.");
+          PADDLE_ENFORCE(
+              not_seen(kOptimizeRole),
+              "Cannot add forward|loss operator after optimize operator.");
+          break;
+        case kOptimizeRole:
+        case kOptimizeRole | kLRSchedRole:
+          PADDLE_ENFORCE(!not_seen(kBackwardRole),
+                         "Optimize operators must follow backward operator.");
+          break;
+        case kLRSchedRole:
+        case kDistRole:
+        case kRPCRole:
+        case kNotSpecifiedRole:
+          break;
+        default:
+          LOG(FATAL) << "Unknown operator role. Don't add new role because "
+                        "you don't know what you are doing.";
+      }
+    }
+  }
+}
+}  // namespace

 Graph::Graph(const ProgramDesc &program) : program_(program) {
+  CheckProgram(program_);
  // Make the nodes id start from 0.
  Node::ResetId();
  auto var_nodes = InitFromProgram(program_);

--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
+  } else if (platform::is_cuda_pinned_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
+                 nullptr);
  }
 #endif
 }

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -79,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> {
      "conv_bn_fuse_pass",              //
      "conv_eltwiseadd_bn_fuse_pass",   //
 #ifdef PADDLE_WITH_MKLDNN
+      "depthwise_conv_mkldnn_pass",             //
      "conv_bias_mkldnn_fuse_pass",             //
      "conv_relu_mkldnn_fuse_pass",             //
      "conv_elementwise_add_mkldnn_fuse_pass",  //

--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"

@@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type,
  op->SetType(type);
  op->SetInput("Xs", inputs);
  op->SetOutput("Xs", outputs);
+  op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(framework::OpRole::kForward));
 }

 TEST(DataFlowGraph, Build_IR_Graph) {

--- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+// Local shorthands for the framework tensor type and the platform helper
+// that the kernels below use to obtain a spatial transformer descriptor.
+using Tensor = framework::Tensor;
+using ScopedSpatialTransformerDescriptor =
+    platform::ScopedSpatialTransformerDescriptor;
+
+// cuDNN forward kernel for affine_grid.
+// Reads input "Theta", determines the target sizes either from the
+// "output_shape" attribute or, when that attribute is empty, from the
+// "OutputShape" input tensor, and fills "Output" by calling
+// cudnnSpatialTfGridGeneratorForward.
+template <typename T>
+class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto* theta = ctx.Input<Tensor>("Theta");
+    auto* output = ctx.Output<Tensor>("Output");
+    const T* theta_data = theta->data<T>();
+
+    // Batch size is taken from Theta's first dimension.
+    int n = theta->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    // h_sizes owns the host buffer; h_size_data points into it and stays
+    // valid only while h_sizes is alive.
+    Tensor h_sizes;
+    int* h_size_data;
+    if (size_attr.size() == 0) {
+      // Sizes come from the OutputShape tensor, copied to the CPU.
+      // NOTE(review): if TensorCopy is asynchronous for a device-resident
+      // OutputShape, the buffer may be read below before the copy has
+      // completed - confirm, or consider TensorCopySync.
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      h_size_data = h_sizes.data<int>();
+    } else {
+      // Sizes come from the attribute; element 0 is replaced by Theta's
+      // batch size rather than taken from size_attr[0].
+      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
+      h_size_data[0] = n;
+      h_size_data[1] = size_attr[1];
+      h_size_data[2] = size_attr[2];
+      h_size_data[3] = size_attr[3];
+    }
+
+    // Output is allocated as {n, sizes[2], sizes[3], 2}.
+    T* output_data = output->mutable_data<T>(
+        {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace());
+    // Descriptor built from the four host-side sizes.
+    ScopedSpatialTransformerDescriptor st_desc;
+    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
+        st_desc.descriptor<T>(4, h_size_data);
+
+    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
+        handle, cudnn_st_desc, theta_data, output_data));
+  }
+};
+
+// cuDNN backward kernel for affine_grid.
+// Reads the gradient of "Output", determines the target sizes the same way
+// as the forward kernel (attribute "output_shape", or input "OutputShape"
+// when the attribute is empty), and writes the gradient of "Theta" by calling
+// cudnnSpatialTfGridGeneratorBackward.
+template <typename T>
+class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
+
+    // Batch size is taken from the first dimension of the output gradient.
+    int n = output_grad->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    // h_sizes owns the host buffer; h_size_data points into it and stays
+    // valid only while h_sizes is alive.
+    Tensor h_sizes;
+    int* h_size_data;
+    if (size_attr.size() == 0) {
+      // Sizes come from the OutputShape tensor, copied to the CPU.
+      // NOTE(review): if TensorCopy is asynchronous for a device-resident
+      // OutputShape, the buffer may be read below before the copy has
+      // completed - confirm, or consider TensorCopySync.
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      h_size_data = h_sizes.data<int>();
+    } else {
+      // Sizes come from the attribute; element 0 is replaced by the batch
+      // size rather than taken from size_attr[0].
+      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
+      h_size_data[0] = n;
+      h_size_data[1] = size_attr[1];
+      h_size_data[2] = size_attr[2];
+      h_size_data[3] = size_attr[3];
+    }
+
+    // Descriptor built from the four host-side sizes.
+    ScopedSpatialTransformerDescriptor st_desc;
+    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
+        st_desc.descriptor<T>(4, h_size_data);
+
+    const T* output_grad_data = output_grad->data<T>();
+    T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());
+
+    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
+        handle, cudnn_st_desc, output_grad_data, theta_grad_data));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Registers the cuDNN forward and backward kernels of affine_grid on
+// CUDAPlace for float and double.
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNAffineGridOpKernel<float>,
+                   paddle::operators::CUDNNAffineGridOpKernel<double>);
+REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNAffineGridGradOpKernel<float>,
+                   paddle::operators::CUDNNAffineGridGradOpKernel<double>);
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/affine_grid_op.h"
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+// Local shorthand for the framework tensor type.
+using Tensor = framework::Tensor;
+
+// CPU specialization of Linspace.
+// Returns a 1-D CPU tensor with `count` values evenly spaced from `start` to
+// `end`, both ends included. For `count == 1` the single value is `start`.
+// `ctx` is not used by this specialization; it is part of the common
+// Linspace interface.
+template <typename T>
+struct Linspace<paddle::platform::CPUDeviceContext, T> {
+  framework::Tensor operator()(T start, T end, int count,
+                               const framework::ExecutionContext& ctx) {
+    Tensor numbers;
+    T* number_data = numbers.mutable_data<T>({count}, platform::CPUPlace());
+    // With a single sample there is no spacing to compute. Dividing by
+    // (count - 1) == 0 would give inf, and 0 * inf would then store NaN, so
+    // use a zero step in that case.
+    const T step = count > 1 ? (end - start) / static_cast<T>(count - 1)
+                             : static_cast<T>(0);
+    for (int i = 0; i < count; ++i) {
+      number_data[i] = start + static_cast<T>(i) * step;
+    }
+    return numbers;
+  }
+};
+
+// Forward operator of affine_grid: validates its inputs and attributes,
+// declares the output shape and chooses the kernel library.
+class AffineGridOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires a 3-D Theta of shape [N, 2, 3] and either a 4-element
+  // attr(output_shape) or a 1-D Input(OutputShape). The output is declared as
+  // [N, -1, -1, 2] and shares Theta's LoD.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Theta"),
+                   "Input(Theta) of AffineGridOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output(Output) of AffineGridOp should not be null.");
+    auto theta_shape = ctx->GetInputDim("Theta");
+    PADDLE_ENFORCE(theta_shape.size() == 3,
+                   "AffineGrid's Input(Theta) should be 3-D tensor.");
+
+    const std::vector<int> shape_attr =
+        ctx->Attrs().Get<std::vector<int>>("output_shape");
+    if (!shape_attr.empty()) {
+      PADDLE_ENFORCE(shape_attr.size() == 4,
+                     "The size of attr(output_shape) should be 4.");
+    } else {
+      // No attribute given: the shape has to arrive as a tensor input.
+      PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
+                     "Input(OutputShape) of AffineGridOp should not be null if "
+                     "attr(output_shape) is not configured.");
+      auto shape_input_dims = ctx->GetInputDim("OutputShape");
+      PADDLE_ENFORCE(shape_input_dims.size() == 1,
+                     "AffineGrid's Input(OutputShape) should be 1-D tensor.");
+    }
+
+    PADDLE_ENFORCE(theta_shape[1] == 2, "Input(theta) dims[1] should be 2.");
+    PADDLE_ENFORCE(theta_shape[2] == 3, "Input(theta) dims[2] should be 3.");
+
+    // N * H * W * 2, with H and W left unknown at this stage.
+    ctx->SetOutputDim("Output",
+                      framework::make_ddim({theta_shape[0], -1, -1, 2}));
+    ctx->ShareLoD("Theta", "Output");
+  }
+
+ protected:
+  // Plain library by default; cuDNN when built with CUDA and
+  // CanCUDNNBeUsed(ctx) holds. The data type follows Theta.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto lib = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      lib = framework::LibraryType::kCUDNN;
+    }
+#endif
+    const auto theta_type =
+        framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
+    return framework::OpKernelType(theta_type, ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, lib);
+  }
+};
+
+// Declares the proto of affine_grid: inputs Theta and (dispensable)
+// OutputShape, output Output, attributes use_cudnn (defaults to true) and
+// output_shape (defaults to an empty vector), plus the operator docs.
+class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Theta",
+        "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
+        "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
+        "y_1).");
+    AddInput("OutputShape",
+             "(Tensor) The shape of target image with format [N, C, H, W].")
+        .AsDispensable();
+    AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
+    // The description must agree with the value passed to SetDefault below.
+    AddAttr<bool>(
+        "use_cudnn",
+        "(bool, default true) Only used in cudnn kernel, need install cudnn")
+        .SetDefault(true);
+    AddAttr<std::vector<int>>(
+        "output_shape",
+        "The target output image shape with format [N, C, H, W].")
+        .SetDefault(std::vector<int>());
+
+    AddComment(R"DOC(
+    It generates a grid of (x,y) coordinates using the parameters of the
+    affine transformation that correspond to a set of points where the input
+    feature map should be sampled to produce the transformed output feature map.
+
+    Given:
+        Theta = [[[x_11, x_12, x_13]
+                  [x_14, x_15, x_16]]
+                 [[x_21, x_22, x_23]
+                  [x_24, x_25, x_26]]]
+    
+        OutputShape = [2, 3, 5, 5]
+
+    Step 1:
+
+        Generate relative coordinates according to OutputShape.
+        The values of relative coordinates are in the interval between -1 and 1.
+        The shape of the relative coordinates is [2, H, W] as below:
+    
+        C = [[[-1.  -1.  -1.  -1.  -1. ]
+              [-0.5 -0.5 -0.5 -0.5 -0.5]
+              [ 0.   0.   0.   0.   0. ]
+              [ 0.5  0.5  0.5  0.5  0.5]
+              [ 1.   1.   1.   1.   1. ]] 
+             [[-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]]]
+        C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
+    
+    Step2:
+        Transpose and reshape C to shape [H * W, 2] and append ones to last dimension. Then we get:
+        C_ = [[-1.  -1.   1. ]
+              [-0.5 -1.   1. ]
+              [ 0.  -1.   1. ]
+              [ 0.5 -1.   1. ]
+              [ 1.  -1.   1. ]
+              [-1.  -0.5  1. ]
+              [-0.5 -0.5  1. ]
+              [ 0.  -0.5  1. ]
+              [ 0.5 -0.5  1. ]
+              [ 1.  -0.5  1. ]
+              [-1.   0.   1. ]
+              [-0.5  0.   1. ]
+              [ 0.   0.   1. ]
+              [ 0.5  0.   1. ]
+              [ 1.   0.   1. ]
+              [-1.   0.5  1. ]
+              [-0.5  0.5  1. ]
+              [ 0.   0.5  1. ]
+              [ 0.5  0.5  1. ]
+              [ 1.   0.5  1. ]
+              [-1.   1.   1. ]
+              [-0.5  1.   1. ]
+              [ 0.   1.   1. ]
+              [ 0.5  1.   1. ]
+              [ 1.   1.   1. ]]
+    Step3:
+        Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
+    )DOC");
+  }
+};
+
+// Backward operator of affine_grid: the gradient of Theta has the same
+// dimensions as Theta, and kernel selection mirrors the forward operator.
+class AffineGridOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Copies Theta's dimensions to Theta@GRAD when that output exists.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto theta_shape = ctx->GetInputDim("Theta");
+    const std::string theta_grad_name = framework::GradVarName("Theta");
+    if (!ctx->HasOutput(theta_grad_name)) {
+      return;
+    }
+    ctx->SetOutputDim(theta_grad_name, theta_shape);
+  }
+
+ protected:
+  // Plain library by default; cuDNN when built with CUDA and
+  // CanCUDNNBeUsed(ctx) holds. The data type follows Theta.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto lib = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      lib = framework::LibraryType::kCUDNN;
+    }
+#endif
+    const auto theta_type =
+        framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
+    return framework::OpKernelType(theta_type, ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, lib);
+  }
+};
+
+// Builds the description of the affine_grid_grad op from the forward op:
+// forwards inputs Theta and OutputShape, the gradient of Output and all
+// attributes, and declares the gradient of Theta as the only output.
+class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Owned by a unique_ptr from the start so the OpDesc is released if any
+    // of the setters below throws.
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("affine_grid_grad");
+    op->SetInput("Theta", Input("Theta"));
+    op->SetInput("OutputShape", Input("OutputShape"));
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
+    return op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Registers the affine_grid operator with its proto maker and grad-op maker,
+// the affine_grid_grad operator, and the CPU kernels of both for float and
+// double.
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
+                  ops::AffineGridGradMaker);
+REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    affine_grid,
+    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    affine_grid_grad,
+    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/affine_grid_op.h
+++ b/paddle/fluid/operators/affine_grid_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+// Shorthand for the framework tensor type.
+using Tensor = framework::Tensor;
+// Shorthand for framework::EigenTensor; defaults to row-major layout and
+// Eigen::DenseIndex indices.
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+// Fixed-rank size lists (1 to 4 entries of int64_t), used below as arguments
+// to Eigen reshape/broadcast/shuffle calls.
+using Array1 = Eigen::DSizes<int64_t, 1>;
+using Array2 = Eigen::DSizes<int64_t, 2>;
+using Array3 = Eigen::DSizes<int64_t, 3>;
+using Array4 = Eigen::DSizes<int64_t, 4>;
+
+/**
+ * Returns a tensor with evenly spaced numbers over a specified interval.
+ *
+ * Only the call operator is declared here; each device context provides its
+ * own specialization.
+ *
+ * start, end: bounds of the interval.
+ * count:      number of values to generate.
+ * ctx:        execution context of the calling kernel.
+ */
+template <typename DeviceContext, typename T>
+struct Linspace {
+  framework::Tensor operator()(T start, T end, int count,
+                               const framework::ExecutionContext& ctx);
+};
+
+/**
+ * Eigen/BLAS based forward kernel of the affine_grid operator.
+ *
+ * Inputs:
+ *   Theta        - first dimension is the batch size n; every batch slice is
+ *                  treated as a [2, 3] matrix.
+ *   OutputShape  - int tensor, read only when the "output_shape" attribute is
+ *                  empty; elements [2] and [3] give h and w.
+ * Attribute:
+ *   output_shape - vector<int>; when non-empty, elements [2] and [3] give h
+ *                  and w.
+ * Output:
+ *   Output       - shape [n, h, w, 2] with Output[i] = grid * Theta[i]^T,
+ *                  where grid is the [h * w, 3] matrix whose rows are
+ *                  (x, y, 1); x comes from Linspace(-1, 1, w) and y from
+ *                  Linspace(-1, 1, h).
+ */
+template <typename DeviceContext, typename T>
+class AffineGridOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto* theta = ctx.Input<Tensor>("Theta");
+    int n = theta->dims()[0];
+
+    // The target size comes from the attribute when it is set, otherwise from
+    // the OutputShape tensor. Both are indexed at [2] and [3], so make sure
+    // they really hold 4 entries before reading them.
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    int h = 0;
+    int w = 0;
+    if (size_attr.empty()) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      PADDLE_ENFORCE_EQ(
+          h_sizes.numel(), 4,
+          "The OutputShape of affine_grid should hold 4 elements.");
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      PADDLE_ENFORCE_EQ(
+          size_attr.size(), 4UL,
+          "The output_shape of affine_grid should hold 4 elements.");
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+
+    auto* output = ctx.Output<Tensor>("Output");
+    output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), output,
+        static_cast<T>(0));
+
+    Linspace<DeviceContext, T> linspace;
+    // Rank-1 coordinates along the height axis (h values from -1 to 1).
+    auto h_idx = linspace(static_cast<T>(-1), static_cast<T>(1), h, ctx);
+    auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
+    // Rank-1 coordinates along the width axis (w values from -1 to 1).
+    auto w_idx = linspace(static_cast<T>(-1), static_cast<T>(1), w, ctx);
+    auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
+    // Constant ones tensor with shape [h, w, 1].
+    Tensor ones;
+    ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
+    auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(static_cast<T>(1));
+    // Grid tensor with shape [n, h, w, 3]: the width coordinates, the height
+    // coordinates and the ones are concatenated (in that order) along the
+    // last axis, then the result is broadcast over the batch.
+    Tensor grid;
+    grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
+    auto grid_t = EigenTensor<T, 4>::From(grid);
+
+    grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
+                               .broadcast(Array2(h, 1))
+                               .reshape(Array3(h, w, 1))
+                               .concatenate(h_idx_t.reshape(Array2(1, h))
+                                                .broadcast(Array2(w, 1))
+                                                .shuffle(Array2(1, 0))
+                                                .reshape(Array3(h, w, 1)),
+                                            2)
+                               .eval()
+                               .concatenate(ones_t, 2)
+                               .reshape(Array4(1, h, w, 3))
+                               .broadcast(Array4(n, 1, 1, 1));
+
+    // output[i] = grid[i] * theta[i]^T, i.e. [h * w, 3] x [3, 2].
+    // TODO(wanghaoshuang): Refine batched matrix multiply
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    for (int i = 0; i < n; ++i) {
+      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
+      Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
+      Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2});
+      blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
+                  T(0));
+    }
+  }
+};
+
+/**
+ * Eigen/BLAS based backward kernel of the affine_grid operator.
+ *
+ * Reads the gradient of Output (first dimension n, each batch slice viewed as
+ * [h * w, 2]) and writes the gradient of Theta with shape [n, 2, 3]:
+ *   Theta@GRAD[i] = Output@GRAD[i]^T * grid
+ * where grid is the same [h * w, 3] matrix of (x, y, 1) rows that the forward
+ * kernel builds. h and w are taken from the "output_shape" attribute when it
+ * is non-empty, otherwise from elements [2] and [3] of the OutputShape input.
+ */
+template <typename DeviceContext, typename T>
+class AffineGridGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
+
+    int n = output_grad->dims()[0];
+    // Both shape sources are indexed at [2] and [3]; check that they hold 4
+    // entries before reading them.
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    int h = 0;
+    int w = 0;
+    if (size_attr.empty()) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      PADDLE_ENFORCE_EQ(
+          h_sizes.numel(), 4,
+          "The OutputShape of affine_grid should hold 4 elements.");
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      PADDLE_ENFORCE_EQ(
+          size_attr.size(), 4UL,
+          "The output_shape of affine_grid should hold 4 elements.");
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+
+    theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
+
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), theta_grad,
+        static_cast<T>(0));
+
+    Linspace<DeviceContext, T> linspace;
+
+    // Rank-1 coordinates along the height axis (h values from -1 to 1).
+    auto h_idx = linspace(static_cast<T>(-1), static_cast<T>(1), h, ctx);
+    auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
+    // Rank-1 coordinates along the width axis (w values from -1 to 1).
+    auto w_idx = linspace(static_cast<T>(-1), static_cast<T>(1), w, ctx);
+    auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
+    // Constant ones tensor with shape [h, w, 1].
+    Tensor ones;
+    ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
+    auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(static_cast<T>(1));
+    // Grid tensor with shape [n, h, w, 3]: the width coordinates, the height
+    // coordinates and the ones are concatenated (in that order) along the
+    // last axis, then the result is broadcast over the batch.
+    Tensor grid;
+    grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
+    auto grid_t = EigenTensor<T, 4>::From(grid);
+    grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
+                               .broadcast(Array2(h, 1))
+                               .reshape(Array3(h, w, 1))
+                               .concatenate(h_idx_t.reshape(Array2(1, h))
+                                                .broadcast(Array2(w, 1))
+                                                .shuffle(Array2(1, 0))
+                                                .reshape(Array3(h, w, 1)),
+                                            2)
+                               .eval()
+                               .concatenate(ones_t, 2)
+                               .reshape(Array4(1, h, w, 3))
+                               .broadcast(Array4(n, 1, 1, 1));
+    // theta_grad[i] = output_grad[i]^T * grid[i], i.e. [2, h * w] x [h * w, 3].
+    // TODO(wanghaoshuang): Refine batched matrix multiply
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    for (int i = 0; i < n; ++i) {
+      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
+      Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2});
+      Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
+      blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
+                  &sliced_theta_grad, T(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
@@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase {
  }
 };

+// Shape inference for delete_var. The call operator has an empty body, so it
+// leaves the InferShapeContext untouched; the class is passed to
+// REGISTER_OPERATOR for delete_var below.
+class DeleteVarOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};
+
 class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
@@ -48,4 +53,5 @@ It should not be configured by users directly.

 REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::DeleteVarOpInfoMaker);
+                  paddle::operators::DeleteVarOpInfoMaker,
+                  paddle::operators::DeleteVarOpShapeInference);
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -341,6 +341,28 @@ class ScopedPoolingDescriptor {
  DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
 };

+// Holds a cudnnSpatialTransformerDescriptor_t for the lifetime of the object:
+// the descriptor is created in the constructor and destroyed in the
+// destructor. Copying is disabled.
+class ScopedSpatialTransformerDescriptor {
+ public:
+  ScopedSpatialTransformerDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
+  }
+  ~ScopedSpatialTransformerDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
+  }
+
+  // Configures the held descriptor with the bilinear sampler, the cuDNN data
+  // type matching T, and the given dimensions, then returns it. The returned
+  // handle is still owned by this object.
+  template <typename T>
+  cudnnSpatialTransformerDescriptor_t descriptor(int nbDims,
+                                                 const int* dimA) {
+    PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor(
+        desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType<T>::type, nbDims, dimA));
+    return desc_;
+  }
+
+ private:
+  cudnnSpatialTransformerDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor);
+};
+
 inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -65,44 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 * include all needed cudnn functions in HPPL
 * different cudnn version has different interfaces
 **/
-#define CUDNN_DNN_ROUTINE_EACH(__macro)             \
-  __macro(cudnnSetTensor4dDescriptor);              \
-  __macro(cudnnSetTensor4dDescriptorEx);            \
-  __macro(cudnnSetTensorNdDescriptor);              \
-  __macro(cudnnGetTensorNdDescriptor);              \
-  __macro(cudnnGetConvolutionNdForwardOutputDim);   \
-  __macro(cudnnGetConvolutionForwardAlgorithm);     \
-  __macro(cudnnCreateTensorDescriptor);             \
-  __macro(cudnnDestroyTensorDescriptor);            \
-  __macro(cudnnCreateFilterDescriptor);             \
-  __macro(cudnnSetFilter4dDescriptor);              \
-  __macro(cudnnSetFilterNdDescriptor);              \
-  __macro(cudnnGetFilterNdDescriptor);              \
-  __macro(cudnnSetPooling2dDescriptor);             \
-  __macro(cudnnSetPoolingNdDescriptor);             \
-  __macro(cudnnGetPoolingNdDescriptor);             \
-  __macro(cudnnDestroyFilterDescriptor);            \
-  __macro(cudnnCreateConvolutionDescriptor);        \
-  __macro(cudnnCreatePoolingDescriptor);            \
-  __macro(cudnnDestroyPoolingDescriptor);           \
-  __macro(cudnnSetConvolution2dDescriptor);         \
-  __macro(cudnnDestroyConvolutionDescriptor);       \
-  __macro(cudnnSetConvolutionNdDescriptor);         \
-  __macro(cudnnGetConvolutionNdDescriptor);         \
-  __macro(cudnnDeriveBNTensorDescriptor);           \
-  __macro(cudnnCreate);                             \
-  __macro(cudnnDestroy);                            \
-  __macro(cudnnSetStream);                          \
-  __macro(cudnnActivationForward);                  \
-  __macro(cudnnConvolutionForward);                 \
-  __macro(cudnnConvolutionBackwardBias);            \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize); \
-  __macro(cudnnTransformTensor);                    \
-  __macro(cudnnPoolingForward);                     \
-  __macro(cudnnPoolingBackward);                    \
-  __macro(cudnnSoftmaxBackward);                    \
-  __macro(cudnnSoftmaxForward);                     \
-  __macro(cudnnGetVersion);                         \
+#define CUDNN_DNN_ROUTINE_EACH(__macro)              \
+  __macro(cudnnSetTensor4dDescriptor);               \
+  __macro(cudnnSetTensor4dDescriptorEx);             \
+  __macro(cudnnSetTensorNdDescriptor);               \
+  __macro(cudnnGetTensorNdDescriptor);               \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);    \
+  __macro(cudnnGetConvolutionForwardAlgorithm);      \
+  __macro(cudnnCreateTensorDescriptor);              \
+  __macro(cudnnDestroyTensorDescriptor);             \
+  __macro(cudnnCreateFilterDescriptor);              \
+  __macro(cudnnSetFilter4dDescriptor);               \
+  __macro(cudnnSetFilterNdDescriptor);               \
+  __macro(cudnnGetFilterNdDescriptor);               \
+  __macro(cudnnSetPooling2dDescriptor);              \
+  __macro(cudnnSetPoolingNdDescriptor);              \
+  __macro(cudnnGetPoolingNdDescriptor);              \
+  __macro(cudnnDestroyFilterDescriptor);             \
+  __macro(cudnnCreateConvolutionDescriptor);         \
+  __macro(cudnnCreatePoolingDescriptor);             \
+  __macro(cudnnDestroyPoolingDescriptor);            \
+  __macro(cudnnSetConvolution2dDescriptor);          \
+  __macro(cudnnDestroyConvolutionDescriptor);        \
+  __macro(cudnnSetConvolutionNdDescriptor);          \
+  __macro(cudnnGetConvolutionNdDescriptor);          \
+  __macro(cudnnDeriveBNTensorDescriptor);            \
+  __macro(cudnnCreateSpatialTransformerDescriptor);  \
+  __macro(cudnnSetSpatialTransformerNdDescriptor);   \
+  __macro(cudnnDestroySpatialTransformerDescriptor); \
+  __macro(cudnnSpatialTfGridGeneratorForward);       \
+  __macro(cudnnSpatialTfGridGeneratorBackward);      \
+  __macro(cudnnSpatialTfSamplerForward);             \
+  __macro(cudnnSpatialTfSamplerBackward);            \
+  __macro(cudnnCreate);                              \
+  __macro(cudnnDestroy);                             \
+  __macro(cudnnSetStream);                           \
+  __macro(cudnnActivationForward);                   \
+  __macro(cudnnConvolutionForward);                  \
+  __macro(cudnnConvolutionBackwardBias);             \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize);  \
+  __macro(cudnnTransformTensor);                     \
+  __macro(cudnnPoolingForward);                      \
+  __macro(cudnnPoolingBackward);                     \
+  __macro(cudnnSoftmaxBackward);                     \
+  __macro(cudnnSoftmaxForward);                      \
+  __macro(cudnnGetVersion);                          \
  __macro(cudnnGetErrorString);
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)


--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):

    load_prog = Program()
    load_block = load_prog.global_block()
+    need_delete_vars = []

    for var_tuple in slice_vars_and_attrs:
        orig_var = var_tuple[0]
        start = var_tuple[1]
        slice_var = var_tuple[2]
-        end = start + reduce(lambda x, y: x * y, slice_var.shape)
+        end = start + slice_var.shape[0]

        clone_orig_var = load_block.create_var(
            name=orig_var.name,
@@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
            attrs={'axes': [0],
                   'starts': [start],
                   'ends': [end]})
-
+        need_delete_vars.append(clone_orig_var)
+    load_block.append_op(
+        type='delete_var',
+        inputs={'X': need_delete_vars}, )
    executor.run(load_prog)
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -154,6 +154,7 @@ __all__ = [
    'mul',
    'sigmoid_cross_entropy_with_logits',
    'maxout',
+    'affine_grid',
    'sequence_reverse',
    'affine_channel',
    'hash',
@@ -710,8 +711,18 @@ def dynamic_gru(input,
              The first part are weights of the update gate and reset gate with
              shape :math:`(D \\times 2D)`, and the second part are weights for
              candidate hidden state with shape :math:`(D \\times D)`.
-        bias_attr(ParamAttr): The parameter attribute for learnable the
-            hidden-hidden bias.
+
+            If it is set to None or one attribute of ParamAttr, dynamic_gru will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates 
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate, 
+            reset gate and candidate calculations. If it is set to None or one 
+            attribute of ParamAttr, dynamic_gru will create ParamAttr as 
+            bias_attr. If the Initializer of the bias_attr is not set, the bias
+            is initialized to zero. Default: None.
        is_reverse(bool): Whether to compute reversed GRU, default
            :attr:`False`.
        gate_activation(str): The activation for update gate and reset gate.
@@ -810,10 +821,29 @@ def gru_unit(input,

    Args:
        input (Variable): The fc transformed input value of current step.
-        hidden (Variable): The hidden value of lstm unit from previous step.
+        hidden (Variable): The hidden value of gru unit from previous step.
        size (integer): The input dimension value.
-        param_attr (ParamAttr): The weight parameters for gru unit. Default: None
-        bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are weights of the update gate and reset gate with
+              shape :math:`(D \\times 2D)`, and the second part are weights for
+              candidate hidden state with shape :math:`(D \\times D)`.
+
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates 
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate, 
+            reset gate and candidate calculations. If it is set to None or one 
+            attribute of ParamAttr, gru_unit will create ParamAttr as 
+            bias_attr. If the Initializer of the bias_attr is not set, the bias
+            is initialized to zero. Default: None.
        activation (string): The activation type for cell (actNode).
                             Default: 'tanh'
        gate_activation (string): The activation type for gates (actGate).
@@ -4443,7 +4473,10 @@ def transpose(x, perm, name=None):
    Examples:
        .. code-block:: python

-            x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32')
+            # use append_batch_size=False to avoid prepending extra 
+            # batch size in shape
+            x = fluid.layers.data(name='x', shape=[5, 10, 15], 
+                            dtype='float32', append_batch_size=False)
            x_transposed = layers.transpose(x, perm=[1, 0, 2])
    """

@@ -6108,6 +6141,124 @@ def crop(x, shape=None, offsets=None, name=None):
    return out


+def affine_grid(theta, out_shape, name=None):
+    """
+    It generates a grid of (x,y) coordinates using the parameters of
+    the affine transformation that correspond to a set of points where
+    the input feature map should be sampled to produce the transformed
+    output feature map.
+
+    .. code-block:: text
+
+        * Case 1:
+
+          Given:
+
+              theta = [[[x_11, x_12, x_13]
+                        [x_14, x_15, x_16]]
+                       [[x_21, x_22, x_23]
+                        [x_24, x_25, x_26]]]
+
+              out_shape = [2, 3, 5, 5]
+
+          Step 1:
+
+              Generate normalized coordinates according to out_shape.
+              The values of the normalized coordinates are in the interval between -1 and 1.
+              The shape of the normalized coordinates is [2, H, W] as below:
+
+              C = [[[-1.  -1.  -1.  -1.  -1. ]
+                    [-0.5 -0.5 -0.5 -0.5 -0.5]
+                    [ 0.   0.   0.   0.   0. ]
+                    [ 0.5  0.5  0.5  0.5  0.5]
+                    [ 1.   1.   1.   1.   1. ]]
+                   [[-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]]]
+              C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.
+
+          Step 2:
+
+              Transpose and reshape C to shape [H * W, 2] and append ones to the last dimension. Then we get:
+              C_ = [[-1.  -1.   1. ]
+                    [-0.5 -1.   1. ]
+                    [ 0.  -1.   1. ]
+                    [ 0.5 -1.   1. ]
+                    [ 1.  -1.   1. ]
+                    [-1.  -0.5  1. ]
+                    [-0.5 -0.5  1. ]
+                    [ 0.  -0.5  1. ]
+                    [ 0.5 -0.5  1. ]
+                    [ 1.  -0.5  1. ]
+                    [-1.   0.   1. ]
+                    [-0.5  0.   1. ]
+                    [ 0.   0.   1. ]
+                    [ 0.5  0.   1. ]
+                    [ 1.   0.   1. ]
+                    [-1.   0.5  1. ]
+                    [-0.5  0.5  1. ]
+                    [ 0.   0.5  1. ]
+                    [ 0.5  0.5  1. ]
+                    [ 1.   0.5  1. ]
+                    [-1.   1.   1. ]
+                    [-0.5  1.   1. ]
+                    [ 0.   1.   1. ]
+                    [ 0.5  1.   1. ]
+                    [ 1.   1.   1. ]]
+          Step 3:
+              Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
+
+    Args:
+        theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
+        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
+            A Variable is passed to the op as an int32 tensor input; a list or tuple as an attribute.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The output with shape [N, H, W, 2].
+
+    Raises:
+        ValueError: If the type of arguments is not supported.
+
+    Examples:
+
+        .. code-block:: python
+
+            theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
+            out_shape = fluid.layers.data(name="y", shape=[-1], dtype="int32")
+            data = fluid.layers.affine_grid(theta, out_shape)
+
+            # or
+            data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
+
+    """
+    helper = LayerHelper('affine_grid')
+
+    if not isinstance(out_shape, (list, tuple, Variable)):
+        raise ValueError("The out_shape should be a list, tuple or Variable.")
+
+    if not isinstance(theta, Variable):
+        raise ValueError("The theta should be a Variable.")
+
+    out = helper.create_variable_for_type_inference(theta.dtype)
+    ipts = {'Theta': theta}
+    attrs = {}
+    if isinstance(out_shape, Variable):
+        ipts['OutputShape'] = out_shape
+    else:
+        attrs['output_shape'] = out_shape
+
+    helper.append_op(
+        type='affine_grid',
+        inputs=ipts,
+        outputs={'Output': out},
+        attrs=attrs if attrs else None)
+    return out
+
+
 def rank_loss(label, left, right, name=None):
    """
    **Rank loss layer for RankNet**

--- a/python/paddle/fluid/tests/unittests/dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import signal
+import subprocess
+import argparse
+import time
+import math
+import random
+from multiprocessing import Process
+from functools import reduce
+
+import numpy as np
+import unittest
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid import io
+
+from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
+from dist_simnet_bow import TestDistSimnetBow2x2, DATA_URL, DATA_MD5
+
+
+class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
+    """Runner whose trainer saves persistables when the SAVE environment
+    variable is set and whose pserver loads them when LOAD is set; both use
+    the directory given by MODEL_DIR.
+    """
+
+    def _load_persistable_vars(self, executor, dirname, program):
+        """Load from `dirname` the variables of `program` accepted by
+        `_is_checkpoint_var`, via `io.load_vars` with `filename=None`.
+        """
+
+        def _is_checkpoint_var(var):
+            """
+            Predicate for `io.load_vars`: return True only for persistable
+            variables that belong in a checkpoint.
+
+            Rejected are variables of type FEED_MINIBATCH/FETCH_LIST/RAW and
+            variables whose name contains "@GRAD", ".trainer_", ".block" or
+            "tmp_".
+
+            : param var(Variable)
+            """
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.RAW:
+                return False
+            # @GRAD are named for gradient variables, checkpoint will not save it.
+            if "@GRAD" in var.name:
+                return False
+            # .trainer_ are named for distribute train variables, checkpoint will not save it.
+            if ".trainer_" in var.name:
+                return False
+
+            # .block is named for distribute train variables, checkpoint will not save it.
+            if ".block" in var.name:
+                return False
+
+            # Names containing "tmp_" are skipped as well.
+            if "tmp_" in var.name:
+                return False
+
+            return var.persistable
+
+        io.load_vars(
+            executor,
+            dirname=dirname,
+            main_program=program,
+            predicate=_is_checkpoint_var,
+            filename=None)
+
+    def run_pserver(self, args):
+        """Build the model, transpile it for `args.current_endpoint`, run the
+        pserver startup program, optionally load saved variables, then run
+        the pserver program.
+        """
+        self.get_model(batch_size=2)
+        # NOTE: pserver should not call memory optimize
+        t = self.get_transpiler(args.trainer_id,
+                                fluid.default_main_program(), args.endpoints,
+                                args.trainers, args.sync_mode)
+        pserver_prog = t.get_pserver_program(args.current_endpoint)
+        startup_prog = t.get_startup_program(args.current_endpoint,
+                                             pserver_prog)
+
+        # LOAD is parsed as an integer flag (default "0"); MODEL_DIR defaults
+        # to the empty string.
+        need_load = bool(int(os.getenv("LOAD", "0")))
+        model_dir = os.getenv("MODEL_DIR", "")
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+
+        # Loading happens after the startup program has run and before the
+        # pserver program starts; the variable list is taken from
+        # startup_prog.
+        if need_load and model_dir:
+            self._load_persistable_vars(exe, model_dir, startup_prog)
+        exe.run(pserver_prog)
+
+    def run_trainer(self, args):
+        """Build the model and, only when SAVE is set, run RUN_STEP training
+        steps and save the persistables to MODEL_DIR. In every case the
+        flattened value of the variable '__fc_b__' is printed at the end.
+        """
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+            self.get_model(batch_size=2)
+
+        if args.mem_opt:
+            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
+        if args.is_dist:
+            t = self.get_transpiler(args.trainer_id,
+                                    fluid.default_main_program(),
+                                    args.endpoints, args.trainers,
+                                    args.sync_mode)
+
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+
+        build_stra = fluid.BuildStrategy()
+
+        if args.use_reduce:
+            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        else:
+            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+
+        exe = fluid.ParallelExecutor(
+            args.use_cuda,
+            loss_name=avg_cost.name,
+            exec_strategy=strategy,
+            build_strategy=build_stra)
+
+        # Feed every variable of the trainer program that is marked as data.
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.values()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
+
+        def get_data():
+            # With is_dist and use_reader_alloc, keep only the items whose
+            # offset parity equals this trainer's id; otherwise return the
+            # batch unchanged.
+            origin_batch = next(reader_generator)
+            if args.is_dist and args.use_reader_alloc:
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch
+
+        need_save = bool(int(os.getenv("SAVE", "0")))
+        model_dir = os.getenv("MODEL_DIR", "")
+
+        # Training steps are executed only on the saving run.
+        if need_save:
+            for _ in six.moves.xrange(RUN_STEP):
+                loss, = exe.run(fetch_list=[avg_cost.name],
+                                feed=feeder.feed(get_data()))
+            if need_save and model_dir:
+                io.save_persistables(startup_exe, model_dir, trainer_prog)
+
+        # Print the flattened '__fc_b__' values from the global scope.
+        var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor())
+        print(np.ravel(var).tolist())
+
+
+if __name__ == "__main__":
+    # Download the dataset (DATA_URL, checked against DATA_MD5) first, then
+    # hand this test class to runtime_main.
+    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
+    runtime_main(TestDistSaveLoad2x2)
--- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def AffineGrid(theta, size):
+    """NumPy reference for the affine_grid op.
+
+    Args:
+        theta: array of shape [n, 2, 3] holding one affine matrix per batch.
+        size: target shape indexed as [n, c, h, w]; only n, h and w are used.
+
+    Returns:
+        float32 array of shape [n, h, w, 2] where each batch is the
+        [h * w, 3] matrix of (x, y, 1) rows multiplied by theta[i]^T.
+    """
+    n, h, w = size[0], size[2], size[3]
+
+    # ys varies along the height axis, xs along the width axis.
+    ys, xs = np.meshgrid(
+        np.linspace(-1, 1, h), np.linspace(-1, 1, w), indexing='ij')
+    base = np.stack([xs, ys, np.ones([h, w])], axis=2).reshape([h * w, 3])
+
+    result = np.zeros([n, h * w, 2])
+    theta_t = theta.transpose([0, 2, 1])
+    for i, mat in enumerate(theta_t):
+        result[i] = np.dot(base, mat)
+
+    return result.reshape([n, h, w, 2]).astype("float32")
+
+
+class TestAffineGridOp(OpTest):
+    """Checks the affine_grid op against the AffineGrid NumPy reference.
+
+    Subclasses override initTestCase to choose the shapes and whether the
+    target shape is passed as the OutputShape input (dynamic_shape=True) or
+    as the output_shape attribute.
+    """
+
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "affine_grid"
+        # An all-ones theta is what the test has always effectively used: the
+        # random draw that used to precede this line was overwritten at once.
+        theta = np.ones(self.theta_shape).astype("float32")
+        self.inputs = {'Theta': theta}
+        self.attrs = {"use_cudnn": True}
+        if self.dynamic_shape:
+            self.inputs['OutputShape'] = self.output_shape
+        else:
+            self.attrs['output_shape'] = self.output_shape
+        self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        # Only Theta receives a gradient; OutputShape is excluded.
+        self.check_grad(
+            ['Theta'],
+            'Output',
+            no_grad_set=['OutputShape'],
+            max_relative_error=0.006)
+
+    def initTestCase(self):
+        self.theta_shape = (3, 2, 3)
+        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = False
+
+
+class TestAffineGridOpCase1(TestAffineGridOp):
+    """Same shapes as the base case, but with dynamic_shape enabled so the
+    target shape is fed through the OutputShape input."""
+
+    def initTestCase(self):
+        self.dynamic_shape = True
+        self.output_shape = np.array([3, 2, 5, 7], dtype="int32")
+        self.theta_shape = (3, 2, 3)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import shutil
+import unittest
+import tempfile
+
+import numpy as np
+
+from test_dist_base import TestDistBase, RUN_STEP
+
+
+class TestDistSaveLoadDense2x2(TestDistBase):
+    """Runs the model once with SAVE=1 and once as a cluster with LOAD=1 on
+    the same MODEL_DIR, then compares the values each run printed."""
+
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs=None):
+        """Run `model_file` locally (saving) and as a cluster (loading) and
+        assert that the reported arrays agree.
+
+        Args:
+            model_file: script passed to `_run_local` / `_run_cluster`.
+            delta: maximum absolute difference allowed per element.
+            check_error_log: when True, verbose GLOG settings are added to the
+                environment and the flag is forwarded to both runs.
+            need_envs: optional dict of extra environment variables.
+
+        Raises:
+            AssertionError: if any pair of result arrays differs by more than
+                `delta`.
+        """
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "http_proxy": ""
+        }
+
+        if need_envs:
+            required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        model_dir = tempfile.mkdtemp()
+        try:
+            local_env = {}
+            local_env["SAVE"] = "1"
+            local_env["MODEL_DIR"] = model_dir
+            local_env.update(required_envs)
+
+            cluster_env = {}
+            cluster_env["LOAD"] = "1"
+            cluster_env["MODEL_DIR"] = model_dir
+            cluster_env.update(required_envs)
+
+            local_var = self._run_local(model_file, local_env,
+                                        check_error_log)
+            tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
+                                                 check_error_log)
+        finally:
+            # Remove the temporary model directory even when a run fails.
+            shutil.rmtree(model_dir)
+
+        # NOTE(review): eval() executes whatever the child processes printed.
+        # The expected payload is a plain list of floats; consider a safer
+        # parser if this output can ever come from an untrusted source.
+        local_np = np.array(eval(local_var[0]))
+        train0_np = np.array(eval(tr0_var[0]))
+        train1_np = np.array(eval(tr1_var[0]))
+
+        # Compare element-wise. Comparing `.all()` of each array would only
+        # compare two booleans and could not detect differing values.
+        np.testing.assert_allclose(local_np, train0_np, rtol=0, atol=delta)
+        np.testing.assert_allclose(local_np, train1_np, rtol=0, atol=delta)
+        np.testing.assert_allclose(train0_np, train1_np, rtol=0, atol=delta)
+
+    @unittest.skip(reason="CI fail")
+    def test_dist(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
+        self.check_with_place(
+            "dist_save_load.py",
+            delta=0,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -865,6 +865,22 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(out)
        print(str(program))

+    def test_affine_grid(self):
+        # Builds affine_grid twice and checks that both calls return a result.
+        program = Program()
+        with program_guard(program):
+            theta = layers.data(name="theta", shape=[2, 3], dtype="float32")
+            out_shape = layers.data(
+                name="out_shape", shape=[-1], dtype="float32")
+            # out_shape given as a data Variable.
+            data_0 = layers.affine_grid(theta, out_shape)
+            # out_shape given as a plain Python list.
+            data_1 = layers.affine_grid(theta, [5, 3, 28, 28])
+
+            self.assertIsNotNone(data_0)
+            self.assertIsNotNone(data_1)
+        print(str(program))
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+from threading import Thread
+
+
+def user_reader(inputs):
+    # Wraps `inputs` in a reader creator: every call of the returned
+    # function gives a new generator that yields the items of `inputs`
+    # one by one, in order.
+    def _iterate_items():
+        for item in inputs:
+            yield item
+
+    return _iterate_items
+
+
+def batch_feeder(batch_reader, pin_memory=False, img_dtype="float32"):
+    # Turns a batched (sample, label) reader into a tensor provider.
+    #
+    # batch_reader: callable returning an iterable of batches, where each
+    #     batch is an iterable of (sample, label) pairs.
+    # pin_memory: when true, tensors are set on core.CUDAPinnedPlace(),
+    #     otherwise on core.CPUPlace().
+    # img_dtype: numpy dtype used for the stacked samples; labels are
+    #     always stacked as "int64".
+    #
+    # Returns a generator function yielding [sample_tensor, label_tensor]
+    # (two core.LoDTensor objects) for every batch.
+    def _feeder():
+        for batch_data in batch_reader():
+            images, labels = [], []
+            for image, raw_label in batch_data:
+                images.append(image)
+                # Each label is wrapped in its own list before stacking.
+                labels.append([raw_label])
+
+            if pin_memory:
+                place = core.CUDAPinnedPlace()
+            else:
+                place = core.CPUPlace()
+
+            image_tensor = core.LoDTensor()
+            image_tensor.set(np.array(images, dtype=img_dtype), place)
+            label_tensor = core.LoDTensor()
+            label_tensor.set(np.array(labels, dtype="int64"), place)
+            yield [image_tensor, label_tensor]
+
+    return _feeder
+
+
+class TestPyReader(unittest.TestCase):
+    # Round-trip check for py_reader: batches are handed to the reader as
+    # LoDTensors (set on CUDA pinned memory when Paddle is compiled with
+    # CUDA, on CPU memory otherwise) and everything fetched through
+    # read_file must equal what was fed.
+
+    def setUp(self):
+        self.capacity = 10
+        # One entry per reader output (sample, label). The samples generated
+        # in the test have shapes [3, 2, 1] and [1]; the leading -1 is
+        # presumably the batch dimension.
+        self.shapes = [(-1, 3, 2, 1), (-1, 1)]
+        self.lod_levels = [0, 0]
+        self.dtypes = ['float32', 'int64']
+
+    def test_pin_memory_pyreader(self):
+        batch_size = 2
+        # Decide once whether this is a CUDA build: it selects the executor
+        # place and whether batches are staged in pinned memory.
+        use_cuda = bool(fluid.core.is_compiled_with_cuda())
+
+        def batched_reader():
+            # Fresh batch reader over self.inputs on every call.
+            return paddle.batch(
+                user_reader(self.inputs), batch_size=batch_size)
+
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            executor = fluid.Executor(place)
+
+            data_file = fluid.layers.py_reader(
+                capacity=self.capacity,
+                dtypes=self.dtypes,
+                lod_levels=self.lod_levels,
+                shapes=self.shapes)
+            read_out_data = fluid.layers.read_file(data_file)
+
+            self.inputs = []
+            for _ in range(10):
+                sample = np.random.uniform(
+                    low=0, high=1, size=[3, 2, 1]).astype("float32")
+                label = np.random.uniform(
+                    low=0, high=10, size=[1]).astype("int64")
+                self.inputs.append((sample, label))
+
+            # Tensor form of every batch; its length is the number of
+            # executor runs performed below.
+            self.input_tensors = []
+            for d, l in batch_feeder(
+                    batched_reader(), pin_memory=use_cuda)():
+                ta = fluid.LoDTensorArray()
+                ta.append(d)
+                ta.append(l)
+                self.input_tensors.append(ta)
+
+            # Expected values: the same batches kept as plain Python lists.
+            self.batched_inputs = []
+            for batch in batched_reader()():
+                feed_d = []
+                feed_l = []
+                for d, l in batch:
+                    feed_d.append(d)
+                    feed_l.append([l])
+                self.batched_inputs.append([feed_d, feed_l])
+
+            data_file.decorate_tensor_provider(
+                batch_feeder(
+                    batched_reader(), pin_memory=use_cuda))
+
+            executor.run(fluid.default_startup_program())
+            self.outputs = []
+
+            data_file.start()
+            try:
+                for _ in self.input_tensors:
+                    self.outputs.append(
+                        executor.run(fetch_list=list(read_out_data)))
+            finally:
+                # Reset the reader even when a run raises, so a failure
+                # does not leave it started.
+                data_file.reset()
+            self.validate()
+
+    def validate(self):
+        # Every fetched batch must match the fed batch element by element.
+        self.assertEqual(len(self.batched_inputs), len(self.outputs))
+        for in_data_list, out_data_list in zip(self.batched_inputs,
+                                               self.outputs):
+            self.assertEqual(len(in_data_list), len(out_data_list))
+            # The expected side holds Python lists; stack them into arrays
+            # before comparing.
+            in_data_list_np = [
+                np.array(in_items) for in_items in in_data_list
+            ]
+            for in_data, out_data in zip(in_data_list_np, out_data_list):
+                self.assertTrue((in_data == out_data).all())
+
+
+# Run the unittest runner when this file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -920,11 +920,11 @@ to transpile() call.")
            block_idx = int(block_name.split(block_suffix)[1])
            orig_var = self.origin_program.global_block().vars[orig_var_name]

-            skip_numel = 0
+            skip_dim0 = 0
            slice_vars = self.param_var_mapping[orig_var_name]
            for slice_var in slice_vars[:block_idx]:
-                skip_numel += reduce(lambda x, y: x * y, slice_var.shape)
-            slice_vars_and_attrs.append([orig_var, skip_numel, param])
+                skip_dim0 += slice_var.shape[0]
+            slice_vars_and_attrs.append([orig_var, skip_dim0, param])

        return slice_vars_and_attrs