Merge remote-tracking branch 'ups/develop' into refine/seqpool/feed

ce909664 · tensor-tang · a0a27bd2 · f2afb070 · ce909664 · ce909664
19 changed files
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_GIT_TAG         "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9")
+SET(NGRAPH_GIT_TAG         "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
 SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
-SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
+# Pick the TBB shared-library file name to match the build type: a Debug
+# build uses libtbb_debug.so.2, every other build type uses libtbb.so.2.
+# The same CMAKE_BUILD_TYPE is forwarded to the nGraph external project below.
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    SET(NGRAPH_TBB_LIB_NAME    libtbb_debug.so.2)
+else()
+    SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
+endif()
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
 SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
 SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
@@ -66,16 +70,7 @@ ExternalProject_Add(
    CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
    CMAKE_ARGS          -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
-)
+    CMAKE_ARGS          -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
-# Workaround for nGraph expecting mklml to be in mkldnn install directory.
-ExternalProject_Add_Step(
-    ${NGRAPH_PROJECT}
-    PrepareMKL
-    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
-    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
-    DEPENDEES download
-    DEPENDERS configure
 )
 add_dependencies(ngraph ${NGRAPH_PROJECT})

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -226,7 +226,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
   * Only variables should be the leaves of graph.
   */
  AddOutputToLeafOps(&result);
-  result.Erase<GraphOps>(kGraphOps);
+  result.Erase(kGraphOps);
  return graph;
 }

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -109,7 +109,6 @@ class Graph {
    attr_dels_[attr_name] = []() {};
  }
-  template <typename AttrType>
  void Erase(const std::string &attr_name) {
    PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
                   attr_name);

--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -116,6 +116,10 @@ D
      --modeldir=$DATA_DIR/mobilenet/model \
      --data=$DATA_DIR/mobilenet/data.txt \
      --refer=$DATA_DIR/mobilenet/result.txt 
+    if [ $? -ne 0 ]; then
+      echo "trt demo trt_mobilenet_demo runs fail."
+      exit 1
+    fi
  fi
 done
 set +x
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -38,8 +38,8 @@ void Main() {
  std::unique_ptr<PaddlePredictor> predictor;
  paddle::contrib::AnalysisConfig config;
  config.EnableUseGpu(100, 0);
-  config.SetModel(FLAGS_modeldir + "/__params__",
+  config.SetModel(FLAGS_modeldir + "/__model__",
-                  FLAGS_modeldir + "/__model__");
+                  FLAGS_modeldir + "/__params__");
  config.EnableTensorRtEngine();
  predictor = CreatePaddlePredictor(config);

--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -283,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) {
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, 4 /* multi_thread */);
+                 input_slots_all, &outputs, 2 /* multi_thread */);
 }
 // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing

--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -195,6 +195,10 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::SelectedRows& input1,
                  framework::Tensor* input2) {
+    if (UNLIKELY(input1.rows().size() == 0)) {
+      LOG(WARNING) << "input selected rows is empty!";
+      return;
+    }
    auto in1_height = input1.height();
    auto in2_dims = input2->dims();
    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);

--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -41,7 +41,9 @@ class SumOp : public framework::OperatorWithKernel {
      return;  // skip runtime infershape when is tensor array;
    }
+    auto x_var_types = ctx->GetInputsVarType("X");
    auto x_dims = ctx->GetInputsDim("X");
    size_t N = x_dims.size();
    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
    if (N == 1) {
@@ -49,7 +51,13 @@ class SumOp : public framework::OperatorWithKernel {
    }
    framework::DDim in_dim({0});
-    for (auto& x_dim : x_dims) {
+    for (size_t i = 0; i < x_dims.size(); ++i) {
+      auto& x_dim = x_dims[i];
+      // x_dim.size() == 1 means the real dim of selected rows is [0]
+      if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
+          x_dim.size() == 1) {
+        continue;
+      }
      if (framework::product(x_dim) == 0) {
        continue;
      }

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -3,7 +3,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune fe
 if(WITH_PYTHON)
  list(APPEND PYBIND_DEPS py_func_op)
 endif()
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc)
 if(WITH_PYTHON)
  if(WITH_AMD_GPU)

--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/pybind/ir.h"
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "pybind11/stl.h"
+namespace py = pybind11;
+using paddle::framework::ir::Graph;
+using paddle::framework::ir::Node;
+using paddle::framework::OpDesc;
+using paddle::framework::ProgramDesc;
+using paddle::framework::VarDesc;
+using pybind11::return_value_policy;
+namespace paddle {
+namespace pybind {
+// Registers the Python class "Graph" on module `m` as a binding of
+// framework::ir::Graph. Instances are held by std::shared_ptr<Graph>.
+//
+// Methods that hand out nodes use return_value_policy::reference, so Python
+// receives a non-owning reference and the node stays owned on the C++ side.
+void BindGraph(py::module *m) {
+  py::class_<Graph, std::shared_ptr<Graph>>(
+      *m, "Graph",
+      "The graph is a Directed Acyclic Single Static Assignment Graph, see "
+      "`paddle::ir::Graph` for details.")
+      // A Graph is constructed from a ProgramDesc.
+      .def(py::init<const ProgramDesc &>())
+      .def("has", &Graph::Has)
+      // Typed attribute getters: one Python method per instantiation of the
+      // Graph::Get<T> template.
+      .def("get_int", &Graph::Get<int>)
+      .def("get_float", &Graph::Get<float>)
+      .def("get_double", &Graph::Get<double>)
+      .def("get_string", &Graph::Get<std::string>)
+      // Overloaded "set": each overload heap-allocates a copy of the value and
+      // passes the raw pointer to Graph::Set.
+      // NOTE(review): presumably Graph::Set takes ownership of that pointer --
+      // confirm against graph.h.
+      // NOTE(review): overloads are tried in registration order (int, string,
+      // float, double); a Python float likely resolves to the `float`
+      // overload, leaving the `double` one unreachable -- confirm.
+      .def("set", [](Graph &self, const std::string &attr_name,
+                     int attr) { return self.Set(attr_name, new int(attr)); })
+      .def("set",
+           [](Graph &self, const std::string &attr_name,
+              const std::string &attr) {
+             return self.Set(attr_name, new std::string(attr));
+           })
+      .def("set",
+           [](Graph &self, const std::string &attr_name, float attr) {
+             return self.Set(attr_name, new float(attr));
+           })
+      .def("set",
+           [](Graph &self, const std::string &attr_name, double attr) {
+             return self.Set(attr_name, new double(attr));
+           })
+      .def("erase", &Graph::Erase)
+      .def("nodes", &Graph::Nodes, return_value_policy::reference)
+      // The Create*Node members take a pointer; the lambdas adapt the
+      // reference that pybind11 passes in.
+      .def("create_var_node",
+           [](Graph &self, VarDesc &var_desc) {
+             return self.CreateVarNode(&var_desc);
+           },
+           return_value_policy::reference)
+      .def("create_op_node",
+           [](Graph &self, OpDesc &op_desc) {
+             return self.CreateOpNode(&op_desc);
+           },
+           return_value_policy::reference)
+      .def("create_control_dep_var", &Graph::CreateControlDepVar,
+           return_value_policy::reference)
+      .def("create_empty_node", &Graph::CreateEmptyNode,
+           return_value_policy::reference)
+      .def("release_nodes", &Graph::ReleaseNodes)
+      .def("remove_node",
+           [](Graph &self, Node &node) { return self.RemoveNode(&node); })
+      .def("retrieve_node", &Graph::RetrieveNode,
+           return_value_policy::reference)
+      .def("resolve_hazard", &Graph::ResolveHazard);
+}
+// Registers the Python class "Node" on module `m` as a binding of
+// framework::ir::Node, together with the nested enum "Type".
+void BindNode(py::module *m) {
+  py::class_<Node> node(*m, "Node");
+  // Accessors are exposed as methods; `inputs` and `outputs` are exposed as
+  // read-write attributes.
+  node.def("name", &Node::Name)
+      .def("node_type", &Node::NodeType)
+      .def("var", &Node::Var)
+      .def("op", &Node::Op)
+      .def("id", &Node::id)
+      .def("is_op", &Node::IsOp)
+      .def("is_var", &Node::IsVar)
+      .def("is_ctrl_var", &Node::IsCtrlVar)
+      .def_readwrite("inputs", &Node::inputs)
+      .def_readwrite("outputs", &Node::outputs);
+  // The enum is scoped under Node (Node.Type.Operation / Node.Type.Variable);
+  // export_values() additionally makes the enumerators visible directly in the
+  // enclosing Node scope.
+  py::enum_<Node::Type>(node, "Type")
+      .value("Operation", Node::Type::kOperation)
+      .value("Variable", Node::Type::kVariable)
+      .export_values();
+}
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/ir.h
+++ b/paddle/fluid/pybind/ir.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <pybind11/pybind11.h>
+#include "paddle/fluid/framework/ir/graph.h"
+namespace paddle {
+namespace pybind {
+// Registers the Python binding of framework::ir::Graph on module `m`.
+void BindGraph(pybind11::module *m);
+// Registers the Python binding of framework::ir::Node (and its nested Type
+// enum) on module `m`.
+void BindNode(pybind11::module *m);
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -49,6 +49,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/imperative.h"
+#include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
@@ -775,7 +776,12 @@ All parameter, weight, gradient are variables in Paddle.
          })
      .def("set_int", [](ir::Pass &self, const std::string &name,
                         int val) { self.Set<const int>(name, new int(val)); })
-      .def("type", &ir::Pass::Type);
+      .def("type", &ir::Pass::Type)
+      .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
+        // Runs the pass on `graph`. The graph stays owned by the shared_ptr
+        // holder of the Python object, but Pass::Apply consumes a unique_ptr,
+        // so the raw pointer is wrapped only for the duration of the call.
+        std::unique_ptr<ir::Graph> origin_graph(graph.get());
+        auto optim_graph = self.Apply(std::move(origin_graph));
+        // Give the pointer up again without deleting it. Re-seating the local
+        // shared_ptr copy on it (graph.reset(...)) would create a second,
+        // independent control block that frees the graph as soon as this
+        // lambda returns, leaving the Python-side holder dangling.
+        // NOTE(review): assumes the pass edits the graph in place and returns
+        // the same object -- confirm for every pass exposed to Python.
+        static_cast<void>(optim_graph.release());
+      });
  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
      m, "PassBuilder");
@@ -1042,6 +1048,9 @@ All parameter, weight, gradient are variables in Paddle.
  BindRecordIOWriter(&m);
  BindAsyncExecutor(&m);
+  BindGraph(&m);
+  BindNode(&m);
 }
 }  // namespace pybind
 }  // namespace paddle
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -71,10 +71,25 @@ class DataToLoDTensorConverter(object):
            for each_data in data:
                self._feed_impl_(each_data, lod[1:], lod_level - 1)
+    def _check_shape(self, shape):
+        """Validate `shape` against the shape declared for this converter.
+
+        Dimensions are compared pairwise with `self.shape`. A pair is
+        accepted when both entries are equal or when either entry is
+        negative. Any other pair raises ValueError.
+        """
+        for defined, received in zip(self.shape, shape):
+            if defined == received:
+                continue
+            # A negative entry on either side is never treated as a mismatch.
+            if defined < 0 or received < 0:
+                continue
+            raise ValueError(
+                "Shape not match. What is defined in data layer is {}, but receive {}".
+                format(self.shape, shape))
    def done(self):
        arr = numpy.array(self.data, dtype=self.dtype)
-        if self.shape and len(arr.shape) != len(self.shape):
+        if self.shape:
-            arr = arr.reshape(self.shape)
+            if len(arr.shape) != len(self.shape):
+                try:
+                    arr = arr.reshape(self.shape)
+                except ValueError:
+                    raise ValueError(
+                        "Reshape error. What is defined in data layer is {}, but receive {}"
+                        .format(self.shape, arr.shape))
+            else:
+                self._check_shape(arr.shape)
        t = core.LoDTensor()
        t.set(arr, self.place)
        if self.lod_level > 0:
@@ -152,17 +167,8 @@ class DataFeeder(object):
                raise TypeError("Feed list should contain a list of variable")
            self.feed_dtypes.append(each_var.dtype)
            self.feed_names.append(each_var.name)
-            shape = each_var.shape
-            batch_size_dim = -1
-            for i, s in enumerate(shape):
-                if s < 0:
-                    batch_size_dim = i
-                    break
-            if batch_size_dim == -1:
-                raise ValueError("Variable {0} must has a batch size dimension",
-                                 each_var.name)
            self.feed_lod_level.append(each_var.lod_level)
-            self.feed_shapes.append(shape)
+            self.feed_shapes.append(each_var.shape)
        self.place = place

--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -30,6 +30,12 @@ class TestDataFeeder(unittest.TestCase):
        self.assertEqual(result['image'].recursive_sequence_lengths(), [])
        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
+        try:
+            result = feeder.feed([([0] * 783, [9]), ([1] * 783, [1])])
+            self.assertTrue(False)
+        except ValueError:
+            self.assertTrue(True)
    def test_lod_level_1_converter(self):
        # lod_level = 1
        # each sentence has a different number of words

--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
@@ -31,6 +31,7 @@ fluid.default_main_program().random_seed = 1
 class TestDistCTR2x2(TestDistRunnerBase):
    def get_model(self, batch_size=2):
        dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
        """ network definition """
        dnn_data = fluid.layers.data(
@@ -97,7 +98,14 @@ class TestDistCTR2x2(TestDistRunnerBase):
        inference_program = paddle.fluid.default_main_program().clone()
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+        regularization = None
+        use_l2_decay = bool(os.getenv('USE_L2_DECAY', 0))
+        if use_l2_decay:
+            regularization = fluid.regularizer.L2DecayRegularizer(
+                regularization_coeff=1e-1)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001,
+                                            regularization=regularization)
        sgd_optimizer.minimize(avg_cost)
        dataset = dist_ctr_reader.Dataset()

--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -235,7 +235,6 @@ class DistSeResneXt2x2(TestDistRunnerBase):
        bd = [step * e for e in epochs]
        base_lr = 0.1
-        lr = []
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        optimizer = fluid.optimizer.Momentum(

--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
@@ -18,7 +18,6 @@ import unittest
 from test_dist_base import TestDistBase
-# FIXME(tangwei): sum op can not handle when inputs is empty.
 class TestDistCTR2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
@@ -28,5 +27,19 @@ class TestDistCTR2x2(TestDistBase):
        self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
+class TestDistCTRWithL2Decay2x2(TestDistBase):
+    """Runs dist_ctr.py in sync mode on CPU with USE_L2_DECAY=1 in the env."""
+    def _setup_config(self):
+        self._enforce_place = "CPU"
+        self._sync_mode = True
+    def test_dist_ctr(self):
+        # dist_ctr.py reads USE_L2_DECAY to decide whether to attach an
+        # L2DecayRegularizer to the SGD optimizer.
+        self.check_with_place(
+            "dist_ctr.py",
+            delta=1e-7,
+            check_error_log=False,
+            need_envs={"USE_L2_DECAY": "1"})
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_ir_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_graph.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+import six
+from paddle import fluid
+class TestIRGraph(unittest.TestCase):
+    """
+    Tests for the `fluid.core.Graph` / `fluid.core.Node` Python bindings.
+
+    TODO(fc500110): `resolve_hazard` api will be tested when it can be used.
+    """
+    def test_nodes(self):
+        """nodes() exposes the three variables and the sum op of build_graph()."""
+        graph = build_graph()
+        self.assertTrue(
+            {node.name()
+             for node in graph.nodes()} == {"x1", "x2", "out", "sum"})
+    def test_has_set_get(self):
+        """Attributes are absent before set() and readable by type afterwards."""
+        graph = build_graph()
+        for attr_name in ["int", "float", "string"]:
+            self.assertFalse(graph.has(attr_name))
+        graph.set("int", 1)
+        graph.set("float", 0.5)
+        graph.set("string", "string")
+        for attr_name in ["int", "float", "string"]:
+            self.assertTrue(graph.has(attr_name))
+        self.assertTrue(graph.get_int("int") == 1)
+        self.assertTrue(graph.get_float("float") == 0.5)
+        self.assertTrue(graph.get_string("string") == "string")
+    def test_erase(self):
+        """erase() removes an attribute that was previously set."""
+        graph = build_graph()
+        graph.set("test", 0)
+        self.assertTrue(graph.has("test"))
+        graph.erase("test")
+        self.assertFalse(graph.has("test"))
+    def test_create_var_node(self):
+        """create_var_node() yields a node whose type is Variable."""
+        prog = fluid.core.ProgramDesc()
+        block = prog.block(0)
+        shape = [10, 20]
+        x1 = block.var(six.b("x1"))
+        x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
+        x1.set_shape(shape)
+        graph = fluid.core.Graph(prog)
+        node = graph.create_var_node(x1)
+        self.assertTrue(node.node_type() == fluid.core.Node.Type.Variable)
+    def test_create_op_node(self):
+        """create_op_node() yields a node whose type is Operation."""
+        prog = fluid.core.ProgramDesc()
+        block = prog.block(0)
+        sum_op_desc = block.append_op()
+        graph = fluid.core.Graph(prog)
+        node = graph.create_op_node(sum_op_desc)
+        self.assertTrue(node.node_type() == fluid.core.Node.Type.Operation)
+    def test_create_control_dep_var(self):
+        """The control-dependency variable is named after the node count."""
+        graph = build_graph()
+        # The expected name uses the node count taken *before* the new node is
+        # created.
+        name = "__control_var@{}".format(len(graph.nodes()))
+        node = graph.create_control_dep_var()
+        self.assertTrue(node.name() == name)
+    def test_create_empty_node(self):
+        """create_empty_node() keeps the requested name for both node types."""
+        prog = fluid.core.ProgramDesc()
+        graph = fluid.core.Graph(prog)
+        n1 = graph.create_empty_node('x', fluid.core.Node.Type.Operation)
+        self.assertTrue(n1.name() == 'x')
+        n2 = graph.create_empty_node('y', fluid.core.Node.Type.Variable)
+        self.assertTrue(n2.name() == 'y')
+    def test_release_nodes(self):
+        """release_nodes() empties the graph and returns the released nodes."""
+        graph = build_graph()
+        nodes = graph.release_nodes()
+        self.assertTrue(len(graph.nodes()) == 0)
+        self.assertTrue({node.name()
+                         for node in nodes} == {"x1", "x2", "out", "sum"})
+    def test_remove_node(self):
+        """Drops the "sum" node from the collection returned by nodes().
+
+        NOTE(review): this calls remove() on the Python container returned by
+        graph.nodes(); graph.remove_node() itself is never invoked here, so
+        that binding is not exercised by this test.
+        """
+        graph = build_graph()
+        nodes = graph.nodes()
+        # Leave `node` bound to the "sum" node (or to the last node visited if
+        # no node has that name).
+        for node in nodes:
+            if node.name() == "sum":
+                break
+        self.assertTrue({node.name()
+                         for node in nodes} == {"x1", "x2", "out", "sum"})
+        nodes.remove(node)
+        self.assertTrue({node.name() for node in nodes} == {"x1", "x2", "out"})
+    def test_retrieve_node(self):
+        """Every node fetched by index via retrieve_node() is in nodes()."""
+        graph = build_graph()
+        nodes = []
+        for i in range(len(graph.nodes())):
+            nodes.append(graph.retrieve_node(i))
+        for node in nodes:
+            self.assertTrue(node in graph.nodes())
+    def resolve_hazard(self):
+        # Placeholder: the name lacks the `test` prefix, so unittest does not
+        # collect it (see the TODO in the class docstring).
+        pass
+def build_graph():
+    """Build the small graph shared by the tests in this file.
+
+    Block 0 of a fresh ProgramDesc gets three LOD_TENSOR variables -- `x1` and
+    `x2` with shape [10, 20], and `out` with no explicit shape -- plus one
+    `sum` op that reads `x1`/`x2` and writes `out`.
+
+    Returns:
+        A `fluid.core.Graph` constructed from that ProgramDesc.
+    """
+    prog = fluid.core.ProgramDesc()
+    block = prog.block(0)
+    shape = [10, 20]
+    # prepare input/output
+    x1 = block.var(six.b("x1"))
+    x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
+    x1.set_shape(shape)
+    x2 = block.var(six.b("x2"))
+    x2.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
+    x2.set_shape(shape)
+    out = block.var(six.b("out"))
+    out.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
+    # describe the sum op, then validate its attributes and infer shapes
+    sum_op_desc = block.append_op()
+    sum_op_desc.set_type("sum")
+    sum_op_desc.set_input("X", ["x1", "x2"])
+    sum_op_desc.set_output("Out", ["out"])
+    sum_op_desc.check_attrs()
+    sum_op_desc.infer_shape(block)
+    graph = fluid.core.Graph(prog)
+    return graph
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -752,12 +752,6 @@ class DistributeTranspiler(object):
            elif op not in lr_ops:
                self._append_pserver_non_opt_ops(block, op)
-        def __op_have_grad_input__(op):
-            for varname in op.input_arg_names:
-                if varname.find("@GRAD") >= 0:
-                    return varname
-            return ""
        def __clone_lr_op_sub_block__(op, program, lr_block):
            if not op.has_attr('sub_block'):
                return
@@ -808,7 +802,7 @@ class DistributeTranspiler(object):
            merged_var = None
            for _, op in enumerate(self.optimize_ops):
                # find the origin grad var before clipping/L2Decay,
-                # merged_var should be the input var name of L2Decaybuil
+                # merged_var should be the input var name of L2Decay
                grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
                if op.attr(OP_ROLE_VAR_ATTR_NAME)[
                        0] == optimize_target_param_name:
@@ -1684,7 +1678,16 @@ class DistributeTranspiler(object):
                if self.config.enable_dc_asgd:
                    new_inputs[key] = dc
                else:
-                    new_inputs[key] = merged_var
+                    # Note: this handles L2Decay on a sparse gradient, which creates a new tensor
+                    # for the decayed gradient instead of modifying the original one in place.
+                    origin_grad_name = opt_op.input(key)[0]
+                    if core.kNewGradSuffix(
+                    ) in origin_grad_name and pserver_block.has_var(
+                            origin_grad_name):
+                        new_grad = pserver_block.var(origin_grad_name)
+                        new_inputs[key] = new_grad
+                    else:
+                        new_inputs[key] = merged_var
            elif key == "Param":
                param_block = _get_param_block(opt_op)
                if not param_block: