未验证 提交 842fb021 编写于 作者: G gongweibao 提交者: GitHub

Fix clone() bug. (#12583)

上级 7b03b18d
...@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No ...@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
......
...@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const { ...@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
} }
} }
int OpDesc::GetBlockAttr(const std::string &name) const { std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
auto blocks = boost::get<std::vector<BlockDesc *>>(it->second);
std::vector<int> ids;
for (auto n : blocks) {
ids.push_back(n->ID());
}
return ids;
}
int OpDesc::GetBlockAttrId(const std::string &name) const {
auto it = attrs_.find(name); auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
return boost::get<BlockDesc *>(it->second)->ID(); return boost::get<BlockDesc *>(it->second)->ID();
......
...@@ -83,7 +83,9 @@ class OpDesc { ...@@ -83,7 +83,9 @@ class OpDesc {
Attribute GetNullableAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttr(const std::string &name) const; int GetBlockAttrId(const std::string &name) const;
std::vector<int> GetBlocksAttrIds(const std::string &name) const;
void Rename(const std::string &old_name, const std::string &new_name); void Rename(const std::string &old_name, const std::string &new_name);
......
...@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { ...@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
for (const std::string &attr_name : op->AttrNames()) { for (const std::string &attr_name : op->AttrNames()) {
if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
int sub_block_id = int sub_block_id =
o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name); o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
} }
} }
......
...@@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) { ...@@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) {
std::string ser(seriralized); std::string ser(seriralized);
self.SetAttr(name, ser); self.SetAttr(name, ser);
}) })
.def("block_attr", &pd::OpDesc::GetBlockAttr) .def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
.def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
.def("check_attrs", &pd::OpDesc::CheckAttrs) .def("check_attrs", &pd::OpDesc::CheckAttrs)
.def("infer_shape", &pd::OpDesc::InferShape) .def("infer_shape", &pd::OpDesc::InferShape)
.def("infer_var_type", &pd::OpDesc::InferVarType) .def("infer_var_type", &pd::OpDesc::InferVarType)
......
...@@ -344,7 +344,7 @@ def _append_backward_ops_(block, ...@@ -344,7 +344,7 @@ def _append_backward_ops_(block,
grad_sub_block_list = [] grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first # If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"): if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block")) sub_block = program.block(op.block_attr_id("sub_block"))
grad_sub_block = program.create_block() grad_sub_block = program.create_block()
grad_sub_block._set_forward_block_idx(sub_block.idx) grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op) cb = _callback_lookup_(op)
...@@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): ...@@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for op_idx in range(start_op_idx, block.desc.op_size()): for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx) op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"): if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr("sub_block")) sub_block = block.program.block(op_desc.block_attr_id("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set() new_vars = set()
# create new gradient variables # create new gradient variables
......
...@@ -476,23 +476,25 @@ class Operator(object): ...@@ -476,23 +476,25 @@ class Operator(object):
attrs=None): attrs=None):
self.block = block self.block = block
self.desc = desc self.desc = desc
self.attrs = attrs # note: not add self.attrs here:
if self.attrs is None: # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173
self.attrs = dict() op_attrs = attrs
if op_attrs is None:
op_attrs = dict()
del attrs del attrs
op_maker = core.op_proto_and_checker_maker op_maker = core.op_proto_and_checker_maker
if op_maker.kOpRoleAttrName() not in self.attrs: if op_maker.kOpRoleAttrName() not in op_attrs:
self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
role_var_name = op_maker.kOpRoleVarAttrName() role_var_name = op_maker.kOpRoleVarAttrName()
if len(self.block.program. if len(self.block.program.
op_role_var) != 0 and role_var_name not in self.attrs: op_role_var) != 0 and role_var_name not in op_attrs:
self.attrs[role_var_name] = self.block.program.op_role_var op_attrs[role_var_name] = self.block.program.op_role_var
if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0: if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
del self.attrs[role_var_name] del op_attrs[role_var_name]
if len(self.desc.type()) != 0: if len(self.desc.type()) != 0:
return return
...@@ -576,15 +578,14 @@ class Operator(object): ...@@ -576,15 +578,14 @@ class Operator(object):
arg.op = self arg.op = self
self.desc.set_output(out_proto.name, out_arg_names) self.desc.set_output(out_proto.name, out_arg_names)
if self.attrs is not None: if op_attrs is not None:
if not isinstance(self.attrs, dict): if not isinstance(op_attrs, dict):
raise TypeError("'attrs' should be a dict.") raise TypeError("'attrs' should be a dict.")
for attr in proto.attrs: for attr in proto.attrs:
attr_name = attr.name attr_name = attr.name
if (attr_name not in self.attrs) or ( if (attr_name not in op_attrs) or (op_attrs[attr_name] is None):
self.attrs[attr_name] is None):
continue continue
attr_val = self.attrs[attr_name] attr_val = op_attrs[attr_name]
self._update_desc_attr(attr_name, attr_val) self._update_desc_attr(attr_name, attr_val)
self.desc.check_attrs() self.desc.check_attrs()
...@@ -732,7 +733,6 @@ class Operator(object): ...@@ -732,7 +733,6 @@ class Operator(object):
Raises: Raises:
ValueError: If the type of value doesn't match with desc.attr_type(name). ValueError: If the type of value doesn't match with desc.attr_type(name).
""" """
self.attrs[name] = val
self._update_desc_attr(name, val) self._update_desc_attr(name, val)
def _update_desc_attr(self, name, val): def _update_desc_attr(self, name, val):
...@@ -774,9 +774,9 @@ class Operator(object): ...@@ -774,9 +774,9 @@ class Operator(object):
""" """
return self.desc.attr(name) return self.desc.attr(name)
def block_attr(self, name): def block_attr_id(self, name):
""" """
Get the block attribute by name. Get the block attribute's id by name.
Args: Args:
name(str): the attribute name. name(str): the attribute name.
...@@ -784,22 +784,74 @@ class Operator(object): ...@@ -784,22 +784,74 @@ class Operator(object):
Returns: Returns:
int: the block index. int: the block index.
""" """
return self.desc.block_attr(name) return self.desc.block_attr_id(name)
def block_attr(self, name):
"""
Get the block attribute by name.
Args:
name(str): the attribute name.
Returns:
block: the block attribute.
"""
id = self.block_attr_id(name)
assert (id >= 0 and id < len(self.block.program.blocks))
return self.block.program.blocks[id]
def blocks_attr(self, name):
"""
Get the blocks attribute by name.
Args:
name(str): the attribute name.
Returns:
list: list of the blocks attribute.
"""
attrs = []
for i in self.blocks_attr_ids(name):
assert (i >= 0 and i < len(self.block.program.blocks))
attrs.append(self.block.program.blocks[i])
return attrs
def blocks_attr_ids(self, name):
"""
Get the blocks attribute's ids by name.
Args:
name(str): the attribute name.
Returns:
list: list of the blocks ids.
"""
return self.desc.blocks_attr_ids(name)
def all_attrs(self): def all_attrs(self):
""" """
Get the attribute dict. Get the attribute dict.
Returns: Returns:
dict: The Operator's attribute dict. dict: The Operator's attribute dict, name->attr.
""" """
attr_names = self.attr_names attr_names = self.attr_names
attr_map = {} attr_map = {}
for n in attr_names: for n in attr_names:
if n == 'sub_block': attr_type = self.desc.attr_type(n)
if attr_type == core.AttrType.BLOCK:
attr_map[n] = self.block_attr(n) attr_map[n] = self.block_attr(n)
else: continue
attr_map[n] = self.attr(n)
if attr_type == core.AttrType.BLOCKS:
attr_map[n] = self.blocks_attr(n)
continue
attr_map[n] = self.attr(n)
return attr_map return attr_map
...@@ -1521,8 +1573,14 @@ class Program(object): ...@@ -1521,8 +1573,14 @@ class Program(object):
p = self.inference_optimize(export_for_deployment=False) p = self.inference_optimize(export_for_deployment=False)
else: else:
p = Program() p = Program()
p.current_block_idx = self.current_block_idx
p._seed = self._seed
p.desc = core.ProgramDesc(self.desc) p.desc = core.ProgramDesc(self.desc)
p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())] p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
p._current_role = self._current_role
p._op_role_var = self._op_role_var
p._sync_with_cpp() p._sync_with_cpp()
p._copy_param_info_from(self) p._copy_param_info_from(self)
......
...@@ -264,7 +264,8 @@ class NormalInitializer(Initializer): ...@@ -264,7 +264,8 @@ class NormalInitializer(Initializer):
"dtype": int(var.dtype), "dtype": int(var.dtype),
"mean": self._mean, "mean": self._mean,
"std": self._std_dev, "std": self._std_dev,
"seed": self._seed "seed": self._seed,
"use_mkldnn": False
}) })
var.op = op var.op = op
return op return op
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
import collections
SEED = 1
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# random seed must set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
# TODO(dzhwinter) : refine the initializer and random seed settting
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)))
return predict
def get_model(batch_size):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=trainer_id,
program=main_program,
pservers=pserver_endpoints,
trainers=trainers)
return t
def operator_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, fluid.framework.Program) or \
isinstance(v, fluid.framework.Block):
continue
elif isinstance(v, core.OpDesc):
if v.serialize_to_string() != b.__dict__[k].serialize_to_string():
raise ValueError("In operator_equal not equal:{0}\n".format(k))
elif isinstance(v, collections.OrderedDict):
v0 = sorted(v.iteritems(), key=lambda x: x[0])
v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
if v0 != v1:
raise ValueError("In operator_equal not equal:{0}\n".format(k))
elif (v != b.__dict__[k]):
raise ValueError("In operator_equal not equal:{0}\n".format(k))
return True
def block_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, core.ProgramDesc) or isinstance(
v, fluid.framework.Program) or isinstance(v, core.BlockDesc):
continue
elif k == "ops":
for i in range(0, len(a.ops)):
if not operator_equal(a.ops[i], b.ops[i]):
raise ValueError("In block_equal not equal:{0}\n".format(k))
assert (len(a.ops) == len(b.ops))
elif isinstance(v, collections.OrderedDict):
v0 = sorted(v.iteritems(), key=lambda x: x[0])
v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
if v0 != v1:
raise ValueError("In block_equal not equal:{0}\n".format(k))
elif (v != b.__dict__[k]):
raise ValueError("In block_equal not equal:{0}\n".format(k))
return True
def program_equal(a, b):
for k, v in a.__dict__.iteritems():
if isinstance(v, core.ProgramDesc):
continue
elif k == 'blocks':
for i in range(0, len(a.blocks)):
if not block_equal(a.blocks[i], b.blocks[i]):
raise ValueError("In operator_equal not equal:{0}\n".format(
k))
return False
assert (len(a.blocks) == len(b.blocks))
elif (v != b.__dict__[k]):
raise ValueError("In program_equal not equal:{0}\n".format(k))
return True
class TestDistMnist(unittest.TestCase):
def test_desc_clone(self):
get_model(batch_size=20)
pserver_endpoints = "127.0.0.1:9123"
trainers = 1
current_endpoint = "127.0.0.1:9123"
t = get_transpiler(0,
fluid.default_main_program(), pserver_endpoints,
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
main = pserver_prog.clone()
startup = startup_prog.clone()
self.assertTrue(program_equal(main, pserver_prog))
self.assertTrue(program_equal(startup, startup_prog))
if __name__ == "__main__":
unittest.main()
...@@ -130,7 +130,7 @@ class TestDistBase(unittest.TestCase): ...@@ -130,7 +130,7 @@ class TestDistBase(unittest.TestCase):
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
self._python_interp = "python" self._python_interp = "python"
def start_pserver(self, model_file): def start_pserver(self, model_file, check_error_log):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep, (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
...@@ -139,11 +139,23 @@ class TestDistBase(unittest.TestCase): ...@@ -139,11 +139,23 @@ class TestDistBase(unittest.TestCase):
(self._python_interp, model_file, self._ps_endpoints, ps1_ep, (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers) self._trainers)
ps0_pipe = subprocess.PIPE
ps1_pipe = subprocess.PIPE
if check_error_log:
print("ps0_cmd:", ps0_cmd)
print("ps1_cmd:", ps1_cmd)
ps0_pipe = open("/tmp/ps0_err.log", "wb")
ps1_pipe = open("/tmp/ps1_err.log", "wb")
ps0_proc = subprocess.Popen( ps0_proc = subprocess.Popen(
ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
ps1_proc = subprocess.Popen( ps1_proc = subprocess.Popen(
ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)
return ps0_proc, ps1_proc
if not check_error_log:
return ps0_proc, ps1_proc, None, None
else:
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
def _wait_ps_ready(self, pid): def _wait_ps_ready(self, pid):
retry_times = 50 retry_times = 50
...@@ -160,7 +172,7 @@ class TestDistBase(unittest.TestCase): ...@@ -160,7 +172,7 @@ class TestDistBase(unittest.TestCase):
(e, retry_times)) (e, retry_times))
retry_times -= 1 retry_times -= 1
def check_with_place(self, model_file, delta=1e-3): def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs = { required_envs = {
"PATH": os.getenv("PATH"), "PATH": os.getenv("PATH"),
...@@ -169,17 +181,32 @@ class TestDistBase(unittest.TestCase): ...@@ -169,17 +181,32 @@ class TestDistBase(unittest.TestCase):
"FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1" "FLAGS_cudnn_deterministic": "1"
} }
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
# Run local to get a base line # Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs) env_local.update(required_envs)
local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
(self._python_interp, model_file, (self._python_interp, model_file,
"127.0.0.1:1234", "127.0.0.1:1234", 1) "127.0.0.1:1234", "127.0.0.1:1234", 1)
local_proc = subprocess.Popen( if not check_error_log:
local_cmd.split(" "), local_proc = subprocess.Popen(
stdout=subprocess.PIPE, local_cmd.split(" "),
stderr=subprocess.PIPE, stdout=subprocess.PIPE,
env=env_local) stderr=subprocess.PIPE,
env=env_local)
else:
print("trainer cmd:", local_cmd)
err_log = open("/tmp/trainer.err.log", "wb")
local_proc = subprocess.Popen(
local_cmd.split(" "),
stdout=subprocess.PIPE,
stderr=err_log,
env=env_local)
local_proc.wait() local_proc.wait()
out, err = local_proc.communicate() out, err = local_proc.communicate()
local_ret = out local_ret = out
...@@ -187,7 +214,8 @@ class TestDistBase(unittest.TestCase): ...@@ -187,7 +214,8 @@ class TestDistBase(unittest.TestCase):
sys.stderr.write('local_stderr: %s\n' % err) sys.stderr.write('local_stderr: %s\n' % err)
# Run dist train to compare with local results # Run dist train to compare with local results
ps0, ps1 = self.start_pserver(model_file) ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
check_error_log)
self._wait_ps_ready(ps0.pid) self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid) self._wait_ps_ready(ps1.pid)
...@@ -205,15 +233,23 @@ class TestDistBase(unittest.TestCase): ...@@ -205,15 +233,23 @@ class TestDistBase(unittest.TestCase):
env1.update(required_envs) env1.update(required_envs)
FNULL = open(os.devnull, 'w') FNULL = open(os.devnull, 'w')
tr0_pipe = subprocess.PIPE
tr1_pipe = subprocess.PIPE
if check_error_log:
print("tr0_cmd:", tr0_cmd)
print("tr1_cmd:", tr1_cmd)
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_proc = subprocess.Popen( tr0_proc = subprocess.Popen(
tr0_cmd.split(" "), tr0_cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=tr0_pipe,
env=env0) env=env0)
tr1_proc = subprocess.Popen( tr1_proc = subprocess.Popen(
tr1_cmd.split(" "), tr1_cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=tr1_pipe,
env=env1) env=env1)
tr0_proc.wait() tr0_proc.wait()
...@@ -230,6 +266,13 @@ class TestDistBase(unittest.TestCase): ...@@ -230,6 +266,13 @@ class TestDistBase(unittest.TestCase):
local_first_loss = eval(local_lines[0])[0] local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0] local_last_loss = eval(local_lines[1])[0]
# close trainer file
if check_error_log:
tr0_pipe.close()
tr1_pipe.close()
ps0_pipe.close()
ps1_pipe.close()
# FIXME: use terminate() instead of sigkill. # FIXME: use terminate() instead of sigkill.
os.kill(ps0.pid, signal.SIGKILL) os.kill(ps0.pid, signal.SIGKILL)
os.kill(ps1.pid, signal.SIGKILL) os.kill(ps1.pid, signal.SIGKILL)
......
...@@ -259,7 +259,7 @@ class TestLRDecayConditional(TranspilerTest): ...@@ -259,7 +259,7 @@ class TestLRDecayConditional(TranspilerTest):
serv_op = pserver.blocks[0].ops[0] serv_op = pserver.blocks[0].ops[0]
sub_blocks = [] sub_blocks = []
optimize_blocks = [] optimize_blocks = []
for b in serv_op.attrs["optimize_blocks"]: for b in serv_op.all_attrs()["optimize_blocks"]:
optimize_blocks.append(b.idx) optimize_blocks.append(b.idx)
for b in pserver.blocks: for b in pserver.blocks:
if b.idx not in optimize_blocks: if b.idx not in optimize_blocks:
......
...@@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase): ...@@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase):
self.assertEqual(8, len(op.attr_names())) self.assertEqual(8, len(op.attr_names()))
op.set_block_attr("block_attr", program_desc.block(0)) op.set_block_attr("block_attr", program_desc.block(0))
self.assertEqual(0, op.block_attr("block_attr")) self.assertEqual(0, op.block_attr_id("block_attr"))
mul_op = block.append_op() mul_op = block.append_op()
mul_op.set_type("mul") mul_op.set_type("mul")
......
...@@ -584,12 +584,12 @@ class DistributeTranspiler(object): ...@@ -584,12 +584,12 @@ class DistributeTranspiler(object):
if op.type in [ if op.type in [
"gaussian_random", "fill_constant", "uniform_random" "gaussian_random", "fill_constant", "uniform_random"
]: ]:
op.attrs["shape"] = new_outputs["Out"].shape op.set_attr("shape", list(new_outputs["Out"].shape))
s_prog.global_block().append_op( s_prog.global_block().append_op(
type=op.type, type=op.type,
inputs=new_inputs, inputs=new_inputs,
outputs=new_outputs, outputs=new_outputs,
attrs=op.attrs) attrs=op.all_attrs())
return s_prog return s_prog
# ====================== private transpiler functions ===================== # ====================== private transpiler functions =====================
...@@ -603,7 +603,7 @@ class DistributeTranspiler(object): ...@@ -603,7 +603,7 @@ class DistributeTranspiler(object):
self.table_name = None self.table_name = None
for op in self.origin_program.global_block().ops: for op in self.origin_program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE: if op.type == LOOKUP_TABLE_TYPE:
if op.attrs['is_distributed'] is True: if op.attr('is_distributed') is True:
if self.table_name is None: if self.table_name is None:
self.table_name = op.input("W")[0] self.table_name = op.input("W")[0]
if self.table_name != op.input("W")[0]: if self.table_name != op.input("W")[0]:
...@@ -1263,7 +1263,7 @@ class DistributeTranspiler(object): ...@@ -1263,7 +1263,7 @@ class DistributeTranspiler(object):
type=opt_op.type, type=opt_op.type,
inputs=new_inputs, inputs=new_inputs,
outputs=outputs, outputs=outputs,
attrs=opt_op.attrs) attrs=opt_op.all_attrs())
def _is_splited_grad_var(self, var, var_dict): def _is_splited_grad_var(self, var, var_dict):
grad_block = None grad_block = None
...@@ -1296,7 +1296,7 @@ class DistributeTranspiler(object): ...@@ -1296,7 +1296,7 @@ class DistributeTranspiler(object):
block._clone_variable(var) block._clone_variable(var)
return block.append_op( return block.append_op(
type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
def _append_pserver_non_opt_ops(self, optimize_block, opt_op): def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
program = optimize_block.program program = optimize_block.program
...@@ -1337,7 +1337,7 @@ class DistributeTranspiler(object): ...@@ -1337,7 +1337,7 @@ class DistributeTranspiler(object):
type=opt_op.type, type=opt_op.type,
inputs=inputs, inputs=inputs,
outputs=outputs, outputs=outputs,
attrs=opt_op.attrs) attrs=opt_op.all_attrs())
def _is_op_connected(self, op1, op2): def _is_op_connected(self, op1, op2):
# If one op's input is another op's output or # If one op's input is another op's output or
...@@ -1442,8 +1442,8 @@ class DistributeTranspiler(object): ...@@ -1442,8 +1442,8 @@ class DistributeTranspiler(object):
# optimize # optimize
op_maker = core.op_proto_and_checker_maker op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attrs and \ if op_maker.kOpRoleAttrName() in op.attr_names and \
int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True return True
return False return False
...@@ -1466,8 +1466,8 @@ class DistributeTranspiler(object): ...@@ -1466,8 +1466,8 @@ class DistributeTranspiler(object):
# and op_role_var to get the pair. # and op_role_var to get the pair.
for input_name in op.input_arg_names: for input_name in op.input_arg_names:
if input_name.find("@GRAD") != -1 and \ if input_name.find("@GRAD") != -1 and \
op.attrs[RPC_OP_ROLE_ATTR_NAME]: op.attr(RPC_OP_ROLE_ATTR_NAME):
param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0] param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
params_grads.append([ params_grads.append([
origin_var_dict[param_name], origin_var_dict[param_name],
origin_var_dict[input_name] origin_var_dict[input_name]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册