Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into accelerate_embedding_grad

a2820b98 · minqiyang · a61879a8 · 18be7256 · a2820b98 · a2820b98
10 changed files
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -252,9 +252,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
  std::vector<ir::Node *> sorted_ret;
  for (size_t i = 0; i < ret.size(); ++i) {
    if (i < last_backward) {
-      if (boost::get<int>(ret[i]->Op()->GetAttr(
+      if (static_cast<bool>(boost::get<int>(ret[i]->Op()->GetAttr(
-              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-          static_cast<int>(OpRole::kOptimize)) {
+                            static_cast<int>(OpRole::kOptimize))) {
        optimize_ops.push_back(ret[i]);
      } else {
        sorted_ret.push_back(ret[i]);

--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -71,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
           static_cast<int>(OpRole::kLoss) |
               static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize) |
+               static_cast<int>(OpRole::kLRSched),
           static_cast<int>(OpRole::kNotSpecified)})
      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),

--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -20,6 +20,9 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+//////////////////////////
+// Don't add more roles; doing so would make this too complicated!
+//////////////////////////
 enum class OpRole {
  kForward = 0x0000,
  kBackward = 0x0001,

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -156,12 +156,6 @@ ParallelExecutor::ParallelExecutor(
                           params, member_->local_scopes_, member_->use_cuda_);
 #endif
-  // If the loss_var_name is given, the number of graph should be only one.
-  if (loss_var_name.size()) {
-    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
-                      "The number of graph should be only one");
-  }
  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
    member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
        exec_strategy, member_->local_scopes_, places, std::move(graph)));

--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -333,7 +333,8 @@ def append_gradient_clip_ops(param_grads):
    for p, g in param_grads:
        if g is None:
            continue
-        with p.block.program._optimized_guard([p, g]):
+        with p.block.program._optimized_guard(
+            [p, g]), framework.name_scope('append_clip'):
            clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
            if clip_attr is None:
                clip_attr = NullGradientClipAttr()
@@ -348,7 +349,8 @@ def append_gradient_clip_ops(param_grads):
    for p, g in param_grads:
        if g is None:
            continue
-        with p.block.program._optimized_guard([p, g]):
+        with p.block.program._optimized_guard(
+            [p, g]), framework.name_scope('append_graident_clip'):
            res.append(clip_attr._create_operators(param=p, grad=g))
    return res

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1496,6 +1496,9 @@ class Program(object):
            >>> with program._optimized_guard([p,g]):
            >>>     p = p - 0.001 * g
        """
+        tmp_role = self._current_role
+        tmp_var = self._op_role_var
        OpRole = core.op_proto_and_checker_maker.OpRole
        self._current_role = OpRole.Optimize
        self._op_role_var = [
@@ -1503,11 +1506,11 @@ class Program(object):
            for var in param_and_grads
        ]
        yield
-        self._op_role_var = []
+        self._op_role_var = tmp_var
-        self._current_role = OpRole.Forward
+        self._current_role = tmp_role
    @contextlib.contextmanager
-    def _lr_schedule_guard(self):
+    def _lr_schedule_guard(self, is_with_opt=False):
        """
        A with guard to set :code:`LRSched` :code:`OpRole` and
        :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
@@ -1515,6 +1518,10 @@ class Program(object):
        Notes: This is a very low level API. Users should not use it directly.
+        Args:
+            is_with_opt: Only set to true if these ops are in the middle
+                 of a bunch of optimize ops so that they can be treated
+                 correctly. For example, sgd->lr_op->sgd->lr_op->sgd.
        Examples:
@@ -1528,6 +1535,8 @@ class Program(object):
        OpRole = core.op_proto_and_checker_maker.OpRole
        self._current_role = OpRole.LRSched
+        if is_with_opt:
+            self._current_role = int(OpRole.LRSched) | int(OpRole.Optimize)
        # TODO(typhoonzero): how to set target learning rate var
        self._op_role_var = []
        yield

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -111,7 +111,9 @@ class Optimizer(object):
            if param_lr == 1.0:
                return self._global_learning_rate()
            else:
-                with default_main_program()._lr_schedule_guard():
+                with default_main_program()._lr_schedule_guard(
+                        is_with_opt=True), framework.name_scope(
+                            'scale_with_param_lr'):
                    return self._global_learning_rate() * param_lr
    def _create_accumulators(self, block, parameters):
@@ -602,7 +604,8 @@ class AdamOptimizer(Optimizer):
        for param, grad in param_and_grads:
            if grad is None:
                continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope("optimizer"):
                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                      param)
                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
@@ -740,7 +743,8 @@ class AdamaxOptimizer(Optimizer):
        for param, grad in parameters_and_grads:
            if grad is None:
                continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('adamx'):
                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                      param)
                main_block.append_op(
@@ -1279,7 +1283,8 @@ class ModelAverage(Optimizer):
        for param, grad in self.params_grads:
            if grad is None:
                continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('move_average'):
                self._append_average_accumulate_op(param)
        self.apply_program = Program()

--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -47,7 +47,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
        if grad is None:
            params_and_grads.append((param, grad))
            continue
-        with param.block.program._optimized_guard([param, grad]):
+        with param.block.program._optimized_guard(
+            [param, grad]), framework.name_scope('regularization'):
            regularization_term = None
            if param.regularizer is not None:
                # Add variable for regularization term in grad block

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -78,9 +78,9 @@ if(WITH_DISTRIBUTE)
        set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
        py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
        set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
+        # FIXME(typhoonzero): add this back
-        py_test_modules(test_dist_transformer MODULES test_dist_transformer)
+	#py_test_modules(test_dist_transformer MODULES test_dist_transformer)
-        set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
+	#set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
    endif(NOT APPLE)
    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -49,6 +49,7 @@ LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
 OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
 RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
 )
+OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
 RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
 DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist
 LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
@@ -1717,8 +1718,10 @@ to transpile() call.")
        lr_ops = []
        block = self.origin_program.global_block()
        for op in block.ops:
-            if int(op.attr(RPC_OP_ROLE_ATTR_NAME)) == int(
+            role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME))
-                    LR_SCHED_OP_ROLE_ATTR_VALUE):
+            if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or \
+                role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \
+                    int(OPT_OP_ROLE_ATTR_VALUE):
                lr_ops.append(op)
                log("append lr op: ", op.type)
        return lr_ops