提交 a2820b98 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into accelerate_embedding_grad

...@@ -252,9 +252,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { ...@@ -252,9 +252,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
std::vector<ir::Node *> sorted_ret; std::vector<ir::Node *> sorted_ret;
for (size_t i = 0; i < ret.size(); ++i) { for (size_t i = 0; i < ret.size(); ++i) {
if (i < last_backward) { if (i < last_backward) {
if (boost::get<int>(ret[i]->Op()->GetAttr( if (static_cast<bool>(boost::get<int>(ret[i]->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) == OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kOptimize)) { static_cast<int>(OpRole::kOptimize))) {
optimize_ops.push_back(ret[i]); optimize_ops.push_back(ret[i]);
} else { } else {
sorted_ret.push_back(ret[i]); sorted_ret.push_back(ret[i]);
......
...@@ -71,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, ...@@ -71,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward), static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kLoss) |
static_cast<int>(OpRole::kBackward), static_cast<int>(OpRole::kBackward),
static_cast<int>(OpRole::kOptimize) |
static_cast<int>(OpRole::kLRSched),
static_cast<int>(OpRole::kNotSpecified)}) static_cast<int>(OpRole::kNotSpecified)})
.SetDefault(static_cast<int>(OpRole::kNotSpecified)); .SetDefault(static_cast<int>(OpRole::kNotSpecified));
AddAttr<std::vector<std::string>>(OpRoleVarAttrName(), AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
......
...@@ -20,6 +20,9 @@ limitations under the License. */ ...@@ -20,6 +20,9 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
//////////////////////////
// Don't add more roles to make this too complicated!
//////////////////////////
enum class OpRole { enum class OpRole {
kForward = 0x0000, kForward = 0x0000,
kBackward = 0x0001, kBackward = 0x0001,
......
...@@ -156,12 +156,6 @@ ParallelExecutor::ParallelExecutor( ...@@ -156,12 +156,6 @@ ParallelExecutor::ParallelExecutor(
params, member_->local_scopes_, member_->use_cuda_); params, member_->local_scopes_, member_->use_cuda_);
#endif #endif
// If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) {
PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
"The number of graph should be only one");
}
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, std::move(graph))); exec_strategy, member_->local_scopes_, places, std::move(graph)));
......
...@@ -333,7 +333,8 @@ def append_gradient_clip_ops(param_grads): ...@@ -333,7 +333,8 @@ def append_gradient_clip_ops(param_grads):
for p, g in param_grads: for p, g in param_grads:
if g is None: if g is None:
continue continue
with p.block.program._optimized_guard([p, g]): with p.block.program._optimized_guard(
[p, g]), framework.name_scope('append_clip'):
clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr()) clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
if clip_attr is None: if clip_attr is None:
clip_attr = NullGradientClipAttr() clip_attr = NullGradientClipAttr()
...@@ -348,7 +349,8 @@ def append_gradient_clip_ops(param_grads): ...@@ -348,7 +349,8 @@ def append_gradient_clip_ops(param_grads):
for p, g in param_grads: for p, g in param_grads:
if g is None: if g is None:
continue continue
with p.block.program._optimized_guard([p, g]): with p.block.program._optimized_guard(
[p, g]), framework.name_scope('append_graident_clip'):
res.append(clip_attr._create_operators(param=p, grad=g)) res.append(clip_attr._create_operators(param=p, grad=g))
return res return res
......
...@@ -1496,6 +1496,9 @@ class Program(object): ...@@ -1496,6 +1496,9 @@ class Program(object):
>>> with program._optimized_guard([p,g]): >>> with program._optimized_guard([p,g]):
>>> p = p - 0.001 * g >>> p = p - 0.001 * g
""" """
tmp_role = self._current_role
tmp_var = self._op_role_var
OpRole = core.op_proto_and_checker_maker.OpRole OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.Optimize self._current_role = OpRole.Optimize
self._op_role_var = [ self._op_role_var = [
...@@ -1503,11 +1506,11 @@ class Program(object): ...@@ -1503,11 +1506,11 @@ class Program(object):
for var in param_and_grads for var in param_and_grads
] ]
yield yield
self._op_role_var = [] self._op_role_var = tmp_var
self._current_role = OpRole.Forward self._current_role = tmp_role
@contextlib.contextmanager @contextlib.contextmanager
def _lr_schedule_guard(self): def _lr_schedule_guard(self, is_with_opt=False):
""" """
A with guard to set :code:`LRSched` :code:`OpRole` and A with guard to set :code:`LRSched` :code:`OpRole` and
:code:`OpRoleVar` automatically. The :code:`OpRoleVar` is :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
...@@ -1515,6 +1518,10 @@ class Program(object): ...@@ -1515,6 +1518,10 @@ class Program(object):
Notes: This is a very low level API. Users should not use it directly. Notes: This is a very low level API. Users should not use it directly.
Args:
is_with_opt: Only set to true if these ops a in the middle
of a bunch of optimize ops so that it can be treated
correctly. For example, sgd->lr_op->sgd->lr_op->sgd.
Examples: Examples:
...@@ -1528,6 +1535,8 @@ class Program(object): ...@@ -1528,6 +1535,8 @@ class Program(object):
OpRole = core.op_proto_and_checker_maker.OpRole OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.LRSched self._current_role = OpRole.LRSched
if is_with_opt:
self._current_role = int(OpRole.LRSched) | int(OpRole.Optimize)
# TODO(typhoonzero): how to set target learning rate var # TODO(typhoonzero): how to set target learning rate var
self._op_role_var = [] self._op_role_var = []
yield yield
......
...@@ -111,7 +111,9 @@ class Optimizer(object): ...@@ -111,7 +111,9 @@ class Optimizer(object):
if param_lr == 1.0: if param_lr == 1.0:
return self._global_learning_rate() return self._global_learning_rate()
else: else:
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard(
is_with_opt=True), framework.name_scope(
'scale_with_param_lr'):
return self._global_learning_rate() * param_lr return self._global_learning_rate() * param_lr
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
...@@ -602,7 +604,8 @@ class AdamOptimizer(Optimizer): ...@@ -602,7 +604,8 @@ class AdamOptimizer(Optimizer):
for param, grad in param_and_grads: for param, grad in param_and_grads:
if grad is None: if grad is None:
continue continue
with param.block.program._optimized_guard([param, grad]): with param.block.program._optimized_guard(
[param, grad]), name_scope("optimizer"):
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param) param)
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
...@@ -740,7 +743,8 @@ class AdamaxOptimizer(Optimizer): ...@@ -740,7 +743,8 @@ class AdamaxOptimizer(Optimizer):
for param, grad in parameters_and_grads: for param, grad in parameters_and_grads:
if grad is None: if grad is None:
continue continue
with param.block.program._optimized_guard([param, grad]): with param.block.program._optimized_guard(
[param, grad]), name_scope('adamx'):
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param) param)
main_block.append_op( main_block.append_op(
...@@ -1279,7 +1283,8 @@ class ModelAverage(Optimizer): ...@@ -1279,7 +1283,8 @@ class ModelAverage(Optimizer):
for param, grad in self.params_grads: for param, grad in self.params_grads:
if grad is None: if grad is None:
continue continue
with param.block.program._optimized_guard([param, grad]): with param.block.program._optimized_guard(
[param, grad]), name_scope('move_average'):
self._append_average_accumulate_op(param) self._append_average_accumulate_op(param)
self.apply_program = Program() self.apply_program = Program()
......
...@@ -47,7 +47,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None): ...@@ -47,7 +47,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
if grad is None: if grad is None:
params_and_grads.append((param, grad)) params_and_grads.append((param, grad))
continue continue
with param.block.program._optimized_guard([param, grad]): with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('regularization'):
regularization_term = None regularization_term = None
if param.regularizer is not None: if param.regularizer is not None:
# Add variable for regularization term in grad block # Add variable for regularization term in grad block
......
...@@ -78,9 +78,9 @@ if(WITH_DISTRIBUTE) ...@@ -78,9 +78,9 @@ if(WITH_DISTRIBUTE)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
# FIXME(typhoonzero): add this back
py_test_modules(test_dist_transformer MODULES test_dist_transformer) #py_test_modules(test_dist_transformer MODULES test_dist_transformer)
set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) #set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
endif(NOT APPLE) endif(NOT APPLE)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
endif() endif()
......
...@@ -49,6 +49,7 @@ LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" ...@@ -49,6 +49,7 @@ LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName() OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName( RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
) )
OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist
LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
...@@ -1717,8 +1718,10 @@ to transpile() call.") ...@@ -1717,8 +1718,10 @@ to transpile() call.")
lr_ops = [] lr_ops = []
block = self.origin_program.global_block() block = self.origin_program.global_block()
for op in block.ops: for op in block.ops:
if int(op.attr(RPC_OP_ROLE_ATTR_NAME)) == int( role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME))
LR_SCHED_OP_ROLE_ATTR_VALUE): if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or \
role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \
int(OPT_OP_ROLE_ATTR_VALUE):
lr_ops.append(op) lr_ops.append(op)
log("append lr op: ", op.type) log("append lr op: ", op.type)
return lr_ops return lr_ops
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册