From cf1a8f68dd3911198d0ff350c80239a9b181276e Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Wed, 2 Sep 2020 15:37:14 +0800 Subject: [PATCH] cherry-pick try catch (#26880) cherry-pick fix cvm check test=develop Co-authored-by: 123malin --- paddle/fluid/framework/device_worker.cc | 7 ++-- paddle/fluid/framework/device_worker.h | 3 +- paddle/fluid/framework/downpour_worker.cc | 43 +++++++++++++++++++++++ paddle/fluid/framework/hogwild_worker.cc | 1 + paddle/fluid/operators/cvm_op.cc | 8 ----- 5 files changed, 49 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index 6ba596ab159..a2b9dff2c10 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -24,7 +24,7 @@ void DeviceWorker::SetDataFeed(DataFeed* data_feed) { } template -std::string PrintLodTensorType(LoDTensor* tensor, int64_t start, int64_t end) { +std::string PrintLodTensorType(Tensor* tensor, int64_t start, int64_t end) { auto count = tensor->numel(); if (start < 0 || end > count) { VLOG(3) << "access violation"; @@ -37,8 +37,7 @@ std::string PrintLodTensorType(LoDTensor* tensor, int64_t start, int64_t end) { return os.str(); } -std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start, - int64_t end) { +std::string PrintLodTensorIntType(Tensor* tensor, int64_t start, int64_t end) { auto count = tensor->numel(); if (start < 0 || end > count) { VLOG(3) << "access violation"; @@ -51,7 +50,7 @@ std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start, return os.str(); } -std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) { +std::string PrintLodTensor(Tensor* tensor, int64_t start, int64_t end) { std::string out_val; if (tensor->type() == proto::VarType::FP32) { out_val = PrintLodTensorType(tensor, start, end); diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index f75d7593fe9..8c6c729a33a 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -45,7 +45,7 @@ limitations under the License. */ namespace paddle { namespace framework { -std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end); +std::string PrintLodTensor(Tensor* tensor, int64_t start, int64_t end); std::pair GetTensorBound(LoDTensor* tensor, int index); bool CheckValidOutput(LoDTensor* tensor, size_t batch_size); @@ -148,6 +148,7 @@ class DeviceWorker { FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; + std::vector all_param_; }; class CPUWorkerBase : public DeviceWorker { diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index b1a1b73a66e..c36f010a941 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -807,7 +807,50 @@ void DownpourWorker::TrainFiles() { } } if (!need_skip) { +#ifdef PADDLE_WITH_PSLIB + try { + op->Run(*thread_scope_, place_); + } catch (std::exception& e) { + fprintf(stderr, "error message: %s\n", e.what()); + auto& ins_id_vec = device_reader_->GetInsIdVec(); + size_t batch_size = device_reader_->GetCurBatchSize(); + std::string s = ""; + for (auto& ins_id : ins_id_vec) { + if (s != "") s += ","; + s += ins_id; + } + fprintf(stderr, "batch_size: %zu, ins_ids_vec: %s\n", batch_size, + s.c_str()); + s = ""; + for (auto& param : all_param_) { + Variable* var = thread_scope_->FindVar(param); + if (var == nullptr) { + continue; + } + Tensor* tensor = nullptr; + int64_t len = 0; + if (var->IsType()) { + tensor = var->GetMutable(); + len = tensor->numel(); + } else if (var->IsType()) { + auto selected_rows = var->GetMutable(); + tensor = selected_rows->mutable_value(); + len = tensor->numel(); + } + if (!tensor->IsInitialized()) { + continue; + } + s += param + ":" + std::to_string(len) + ":"; + s += PrintLodTensor(tensor, 0, len); + fprintf(stderr, "%s\n", s.c_str()); + fflush(stderr); + s = ""; + } + throw e; + } +#else op->Run(*thread_scope_, place_); +#endif } } diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index db6231e9919..79ecef43087 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -72,6 +72,7 @@ void HogwildWorker::CreateThreadScope(const ProgramDesc &program) { thread_scope_ = &root_scope_->NewScope(); for (auto &var : block.AllVars()) { + all_param_.push_back(var->Name()); if (var->Persistable()) { auto *ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index 155f8f518f9..eae8197b1a1 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -27,19 +27,11 @@ class CVMOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CVM"); - OP_INOUT_CHECK(ctx->HasInput("CVM"), "Input", "CVM", "CVM"); OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "CVM"); auto x_dims = ctx->GetInputDim("X"); - auto cvm_dims = ctx->GetInputDim("CVM"); PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, platform::errors::InvalidArgument( "Input(X)'s rank should be 2.")); - PADDLE_ENFORCE_EQ( - cvm_dims.size(), 2UL, - platform::errors::InvalidArgument("Input(CVM)'s rank should be 2.")); - PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL, platform::errors::InvalidArgument( - "The 2nd dimension of " - "Input(CVM) should be 2.")); if (ctx->Attrs().Get("use_cvm")) { ctx->SetOutputDim("Y", {x_dims[0], x_dims[1]}); -- GitLab