未验证 提交 b8f17a04 编写于 作者: H hutuxian 提交者: GitHub

fix problem in dump and add log (#24891)

* Fix the field length in LoD scenario
* Fix the missing LoD info when copying a tensor in dump field
* Add some logs to make debugging easier
上级 76cdbb84
......@@ -163,26 +163,35 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode,
for (auto& field : *dump_fields_) {
Variable* var = scope.FindVar(field);
if (var == nullptr) {
VLOG(0) << "Note: field[" << field
<< "] cannot be find in scope, so it was skipped.";
continue;
}
LoDTensor* tensor = var->GetMutable<LoDTensor>();
if (!tensor->IsInitialized()) {
VLOG(0) << "Note: field[" << field
<< "] is not initialized, so it was skipped.";
continue;
}
framework::LoDTensor cpu_tensor;
if (platform::is_gpu_place(tensor->place())) {
TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
cpu_tensor.set_lod(tensor->lod());
tensor = &cpu_tensor;
}
if (!CheckValidOutput(tensor, batch_size)) {
VLOG(0) << "Note: field[" << field << "] cannot pass check, so it was "
"skipped. Maybe the dimension is "
"wrong ";
continue;
}
for (size_t i = 0; i < batch_size; ++i) {
if (!hit[i]) {
continue;
}
auto output_dim = tensor->dims()[1];
std::string output_dimstr = boost::lexical_cast<std::string>(output_dim);
ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
auto bound = GetTensorBound(tensor, i);
ars[i] = ars[i] + "\t" + field + ":" +
std::to_string(bound.second - bound.first);
ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
}
}
......
......@@ -266,7 +266,7 @@ class AfsManager {
fcntl(fd_read[0], F_SETFD, FD_CLOEXEC);
fp_read = fdopen(fd_read[0], "r");
PADDLE_ENFORCE_NE(
fp_read, 0,
fp_read, nullptr,
platform::errors::External(
"Failed to open file descriptor via fdopen in AfsManager."));
}
......@@ -276,7 +276,7 @@ class AfsManager {
fcntl(fd_write[1], F_SETFD, FD_CLOEXEC);
fp_write = fdopen(fd_write[1], "w");
PADDLE_ENFORCE_NE(
fp_write, 0,
fp_write, nullptr,
platform::errors::External(
"Failed to open file descriptor via fdopen in AfsManager."));
}
......
......@@ -105,6 +105,7 @@ class TestBoxPSPreload(unittest.TestCase):
name='x', shape=[1], dtype='int64', lod_level=0)
y = fluid.layers.data(
name='y', shape=[1], dtype='int64', lod_level=0)
z = layers.data(name='z', shape=[1], dtype='int64')
emb_x, emb_y = _pull_box_sparse([x, y], size=2)
emb_xp = _pull_box_sparse(x, size=2)
concat = layers.concat([emb_x, emb_y], axis=1)
......@@ -114,7 +115,6 @@ class TestBoxPSPreload(unittest.TestCase):
num_flatten_dims=1,
bias_attr=False)
loss = layers.reduce_mean(fc)
layers.Print(loss)
place = fluid.CPUPlace(
) if is_cpu or not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
......@@ -161,8 +161,14 @@ class TestBoxPSPreload(unittest.TestCase):
sync_steps=-1)
optimizer.minimize(loss)
program._pipeline_opt[
"dump_fields"] = ["fc.tmp_0", "fc.tmp_0@GRAD", "hehe"]
program._pipeline_opt["dump_fields"] = [
"fc.tmp_0", "fc.tmp_0@GRAD", "fake_var", "z",
"reduce_mean_3.tmp_0"
]
# fake_var: not in scope
# z: in scope, but not initialized
# reduce_mean_3.tmp_0: dimension is not right
program._pipeline_opt["dump_fields_path"] = "./dump_log/"
program._pipeline_opt["dump_param"] = ["fc.w_0"]
program._pipeline_opt["enable_random_dump"] = True
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册