better fix

test=develop

better fix
test=develop
d5d09672 · Xin Pan · 5ffb48d6 · d5d09672 · d5d09672 · d5d09672
7 changed files
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -252,9 +252,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
  std::vector<ir::Node *> sorted_ret;
  for (size_t i = 0; i < ret.size(); ++i) {
    if (i < last_backward) {
-      if (boost::get<int>(ret[i]->Op()->GetAttr(
-              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-          static_cast<int>(OpRole::kOptimize)) {
+      if (static_cast<bool>(boost::get<int>(ret[i]->Op()->GetAttr(
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kOptimize))) {
        optimize_ops.push_back(ret[i]);
      } else {
        sorted_ret.push_back(ret[i]);

--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -71,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
           static_cast<int>(OpRole::kLoss) |
               static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize) |
+               static_cast<int>(OpRole::kLRSched),
           static_cast<int>(OpRole::kNotSpecified)})
      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),

--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -20,6 +20,9 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+//////////////////////////
+// Don't add more roles: doing so would make this too complicated!
+//////////////////////////
 enum class OpRole {
  kForward = 0x0000,
  kBackward = 0x0001,

--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -333,7 +333,8 @@ def append_gradient_clip_ops(param_grads):
    for p, g in param_grads:
        if g is None:
            continue
-        with p.block.program._optimized_guard([p, g]):
+        with p.block.program._optimized_guard(
+            [p, g]), framework.name_scope('append_clip'):
            clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
            if clip_attr is None:
                clip_attr = NullGradientClipAttr()
@@ -348,7 +349,8 @@ def append_gradient_clip_ops(param_grads):
    for p, g in param_grads:
        if g is None:
            continue
-        with p.block.program._optimized_guard([p, g]):
+        with p.block.program._optimized_guard(
+            [p, g]), framework.name_scope('append_graident_clip'):
            res.append(clip_attr._create_operators(param=p, grad=g))

    return res

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1496,6 +1496,9 @@ class Program(object):
            >>> with program._optimized_guard([p,g]):
            >>>     p = p - 0.001 * g
        """
+        tmp_role = self._current_role
+        tmp_var = self._op_role_var
+
        OpRole = core.op_proto_and_checker_maker.OpRole
        self._current_role = OpRole.Optimize
        self._op_role_var = [
@@ -1503,11 +1506,11 @@ class Program(object):
            for var in param_and_grads
        ]
        yield
-        self._op_role_var = []
-        self._current_role = OpRole.Forward
+        self._op_role_var = tmp_var
+        self._current_role = tmp_role

    @contextlib.contextmanager
-    def _lr_schedule_guard(self):
+    def _lr_schedule_guard(self, is_with_opt=False):
        """
        A with guard to set :code:`LRSched` :code:`OpRole` and
        :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
@@ -1515,6 +1518,10 @@ class Program(object):

        Notes: This is a very low level API. Users should not use it directly.

+        Args:
+            is_with_opt: Only set to true if these ops are in the middle
+                 of a bunch of optimize ops so that they can be treated
+                 correctly. For example, sgd->lr_op->sgd->lr_op->sgd.

        Examples:

@@ -1528,6 +1535,8 @@ class Program(object):

        OpRole = core.op_proto_and_checker_maker.OpRole
        self._current_role = OpRole.LRSched
+        if is_with_opt:
+            self._current_role = int(OpRole.LRSched) | int(OpRole.Optimize)
        # TODO(typhoonzero): how to set target learning rate var
        self._op_role_var = []
        yield

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -111,7 +111,9 @@ class Optimizer(object):
            if param_lr == 1.0:
                return self._global_learning_rate()
            else:
-                with default_main_program()._lr_schedule_guard():
+                with default_main_program()._lr_schedule_guard(
+                        is_with_opt=True), framework.name_scope(
+                            'scale_with_param_lr'):
                    return self._global_learning_rate() * param_lr

    def _create_accumulators(self, block, parameters):
@@ -602,7 +604,8 @@ class AdamOptimizer(Optimizer):
        for param, grad in param_and_grads:
            if grad is None:
                continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope("optimizer"):
                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                      param)
                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
@@ -740,7 +743,8 @@ class AdamaxOptimizer(Optimizer):
        for param, grad in parameters_and_grads:
            if grad is None:
                continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('adamx'):
                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                      param)
                main_block.append_op(
@@ -1279,7 +1283,8 @@ class ModelAverage(Optimizer):
        for param, grad in self.params_grads:
            if grad is None:
                continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('move_average'):
                self._append_average_accumulate_op(param)

        self.apply_program = Program()

--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -47,7 +47,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
        if grad is None:
            params_and_grads.append((param, grad))
            continue
-        with param.block.program._optimized_guard([param, grad]):
+        with param.block.program._optimized_guard(
+            [param, grad]), framework.name_scope('regularization'):
            regularization_term = None
            if param.regularizer is not None:
                # Add variable for regularization term in grad block